diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f9d3d3f..b4a1701 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -26,16 +26,16 @@ jobs: include: - python-version: "3.7" os: windows-latest - gdal-whl-url: https://download.lfd.uci.edu/pythonlibs/archived/cp37/GDAL-3.3.3-cp37-cp37m-win_amd64.whl + gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.3.3-cp37-cp37m-win_amd64.whl - python-version: "3.7" os: ubuntu-latest - gdal-whl-url: https://versaweb.dl.sourceforge.net/project/gdal-wheels-for-linux/GDAL-3.4.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl + gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.4.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl - python-version: "3.8" os: windows-latest - gdal-whl-url: https://download.lfd.uci.edu/pythonlibs/archived/GDAL-3.3.3-cp38-cp38-win_amd64.whl + gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.3.3-cp38-cp38-win_amd64.whl - python-version: "3.8" os: ubuntu-latest - gdal-whl-url: https://versaweb.dl.sourceforge.net/project/gdal-wheels-for-linux/GDAL-3.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl + gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl fail-fast: false steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index 013710d..98b0951 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ PaddleRS具有以下五大特色: * 如果您发现任何PaddleRS存在的问题或是对PaddleRS有建议, 欢迎通过[GitHub Issues](https://github.com/PaddlePaddle/PaddleRS/issues)向我们提出。 * 欢迎加入PaddleRS微信群:
-
+
## 产品矩阵 diff --git a/docs/images/whole_picture.png b/docs/images/whole_picture.png index f99934e..963b854 100644 Binary files a/docs/images/whole_picture.png and b/docs/images/whole_picture.png differ diff --git a/paddlers/models/hash.txt b/paddlers/models/hash.txt index 3307510..922cec8 100644 --- a/paddlers/models/hash.txt +++ b/paddlers/models/hash.txt @@ -1 +1,2 @@ +ppdet ba2aad26e6bc1e5c2dad76ca96692a0d63eccfac ppseg f6c73b478cdf00f40ae69edd35bf6bce2a1687ef \ No newline at end of file diff --git a/paddlers/models/ppdet/core/workspace.py b/paddlers/models/ppdet/core/workspace.py index ec33b64..231532b 100644 --- a/paddlers/models/ppdet/core/workspace.py +++ b/paddlers/models/ppdet/core/workspace.py @@ -210,9 +210,17 @@ def create(cls_or_name, **kwargs): assert type(cls_or_name) in [type, str ], "should be a class or name of a class" name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__ - assert name in global_config and \ - isinstance(global_config[name], SchemaDict), \ - "the module {} is not registered".format(name) + if name in global_config: + if isinstance(global_config[name], SchemaDict): + pass + elif hasattr(global_config[name], "__dict__"): + # support instance return directly + return global_config[name] + else: + raise ValueError("The module {} is not registered".format(name)) + else: + raise ValueError("The module {} is not registered".format(name)) + config = global_config[name] cls = getattr(config.pymodule, name) cls_kwargs = {} diff --git a/paddlers/models/ppdet/data/__init__.py b/paddlers/models/ppdet/data/__init__.py index 69dd9a7..11bc9e4 100644 --- a/paddlers/models/ppdet/data/__init__.py +++ b/paddlers/models/ppdet/data/__init__.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from . import source diff --git a/paddlers/models/ppdet/data/crop_utils/__init__.py b/paddlers/models/ppdet/data/crop_utils/__init__.py index 97043fd..c747d3e 100644 --- a/paddlers/models/ppdet/data/crop_utils/__init__.py +++ b/paddlers/models/ppdet/data/crop_utils/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License. 
\ No newline at end of file
diff --git a/paddlers/models/ppdet/data/reader.py b/paddlers/models/ppdet/data/reader.py
index 8c7845f..a1922d0 100644
--- a/paddlers/models/ppdet/data/reader.py
+++ b/paddlers/models/ppdet/data/reader.py
@@ -23,7 +23,7 @@ else:
 import numpy as np
 from paddle.io import DataLoader, DistributedBatchSampler
-from paddle.fluid.dataloader.collate import default_collate_fn
+from .utils import default_collate_fn
 from paddlers.models.ppdet.core.workspace import register
 from . import transform
@@ -118,7 +118,7 @@ class BaseDataLoader(object):
         collate_batch (bool): whether to collate batch in dataloader.
             If set to True, the samples will collate into batch according
             to the batch size. Otherwise, the ground-truth will not collate,
-            which is used when the number of ground-truch is different in
+            which is used when the number of ground-truth is different in
             samples.
         use_shared_memory (bool): whether to use shared memory to
             accelerate data loading, enable this only if you
@@ -144,7 +144,7 @@ class BaseDataLoader(object):
         self._sample_transforms = Compose(
             sample_transforms, num_classes=num_classes)
-        # batch transfrom
+        # batch transform
         self._batch_transforms = BatchCompose(batch_transforms, num_classes,
                                               collate_batch)
         self.batch_size = batch_size
diff --git a/paddlers/models/ppdet/data/shm_utils.py b/paddlers/models/ppdet/data/shm_utils.py
index 16e948c..5ff72eb 100644
--- a/paddlers/models/ppdet/data/shm_utils.py
+++ b/paddlers/models/ppdet/data/shm_utils.py
@@ -34,7 +34,10 @@ SHM_DEFAULT_MOUNT = '/dev/shm'
 def _parse_size_in_M(size_str):
-    num, unit = size_str[:-1], size_str[-1]
+    if size_str[-1] == 'B':
+        num, unit = size_str[:-2], size_str[-2]
+    else:
+        num, unit = size_str[:-1], size_str[-1]
     assert unit in SIZE_UNIT, \
         "unknown shm size unit {}".format(unit)
     return float(num) * \
diff --git a/paddlers/models/ppdet/data/source/__init__.py b/paddlers/models/ppdet/data/source/__init__.py
index ad593c4..a0ca322 100644
--- a/paddlers/models/ppdet/data/source/__init__.py
+++ b/paddlers/models/ppdet/data/source/__init__.py
@@ -27,3 +27,4 @@ from .category import *
 from .keypoint_coco import *
 from .mot import *
 from .sniper_coco import SniperCOCODataSet
+from .dataset import ImageFolder
diff --git a/paddlers/models/ppdet/data/source/category.py b/paddlers/models/ppdet/data/source/category.py
index cf03d8a..73628a6 100644
--- a/paddlers/models/ppdet/data/source/category.py
+++ b/paddlers/models/ppdet/data/source/category.py
@@ -1,15 +1,15 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
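
# Illustrative sketch (not part of the patch): what the new branch in
# _parse_size_in_M above buys. "64M", "64MB", "1G" and "1GB" now all parse,
# whereas the old single-character slice choked on a trailing "B". SIZE_UNIT
# and the scale-to-megabytes step are assumed here to mirror the rest of
# shm_utils.py, which this hunk does not show.
SIZE_UNIT = ['K', 'M', 'G', 'T']


def parse_size_in_M(size_str):
    if size_str[-1] == 'B':  # accept "KB"/"MB"/"GB"/"TB" spellings
        num, unit = size_str[:-2], size_str[-2]
    else:
        num, unit = size_str[:-1], size_str[-1]
    assert unit in SIZE_UNIT, "unknown shm size unit {}".format(unit)
    # scale relative to 'M': K -> 1/1024, M -> 1, G -> 1024, T -> 1024**2
    return float(num) * (1024.**(SIZE_UNIT.index(unit) - 1))


assert parse_size_in_M('64M') == parse_size_in_M('64MB') == 64.0
assert parse_size_in_M('1GB') == 1024.0
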
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
 # limitations under the License.
 
 from __future__ import absolute_import
@@ -39,24 +39,49 @@ def get_categories(metric_type, anno_file=None, arch=None):
     if arch == 'keypoint_arch':
         return (None, {'id': 'keypoint'})
 
+    if anno_file is None or (not os.path.isfile(anno_file)):
+        logger.warning(
+            "anno_file '{}' is None or does not exist, "
+            "please recheck TrainDataset/EvalDataset/TestDataset.anno_path; "
+            "otherwise the default categories for the metric_type will be used.".
+            format(anno_file))
+
     if metric_type.lower() == 'coco' or metric_type.lower(
     ) == 'rbox' or metric_type.lower() == 'snipercoco':
         if anno_file and os.path.isfile(anno_file):
-            # lazy import pycocotools here
-            from pycocotools.coco import COCO
-
-            coco = COCO(anno_file)
-            cats = coco.loadCats(coco.getCatIds())
-
-            clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
-            catid2name = {cat['id']: cat['name'] for cat in cats}
+            if anno_file.endswith('json'):
+                # lazy import pycocotools here
+                from pycocotools.coco import COCO
+                coco = COCO(anno_file)
+                cats = coco.loadCats(coco.getCatIds())
+
+                clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
+                catid2name = {cat['id']: cat['name'] for cat in cats}
+
+            elif anno_file.endswith('txt'):
+                cats = []
+                with open(anno_file) as f:
+                    for line in f.readlines():
+                        cats.append(line.strip())
+                if cats[0] == 'background': cats = cats[1:]
+
+                clsid2catid = {i: i for i in range(len(cats))}
+                catid2name = {i: name for i, name in enumerate(cats)}
+
+            else:
+                raise ValueError("anno_file {} should be json or txt.".format(
+                    anno_file))
             return clsid2catid, catid2name
 
         # anno file not exist, load default categories of COCO17
         else:
             if metric_type.lower() == 'rbox':
+                logger.warning(
+                    "metric_type: {}, load default categories of DOTA.".format(
+                        metric_type))
                 return _dota_category()
-
+            logger.warning("metric_type: {}, load default categories of COCO.".
+                           format(metric_type))
             return _coco17_category()
 
     elif metric_type.lower() == 'voc':
@@ -77,6 +102,8 @@ def get_categories(metric_type, anno_file=None, arch=None):
         # anno file not exist, load default categories of
         # VOC all 20 categories
         else:
+            logger.warning("metric_type: {}, load default categories of VOC.".
+                           format(metric_type))
             return _vocall_category()
 
     elif metric_type.lower() == 'oid':
@@ -104,6 +131,9 @@ def get_categories(metric_type, anno_file=None, arch=None):
         return clsid2catid, catid2name
     # anno file not exist, load default category 'pedestrian'.
     else:
+        logger.warning(
+            "metric_type: {}, load default categories of pedestrian MOT.".
+ format(metric_type)) return _mot_category(category='pedestrian') elif metric_type.lower() in ['kitti', 'bdd100kmot']: @@ -122,6 +152,9 @@ def get_categories(metric_type, anno_file=None, arch=None): return clsid2catid, catid2name # anno file not exist, load default categories of visdrone all 10 categories else: + logger.warning( + "metric_type: {}, load default categories of VisDrone.".format( + metric_type)) return _visdrone_category() else: diff --git a/paddlers/models/ppdet/data/source/coco.py b/paddlers/models/ppdet/data/source/coco.py index efaf61f..0024009 100644 --- a/paddlers/models/ppdet/data/source/coco.py +++ b/paddlers/models/ppdet/data/source/coco.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import os @@ -33,12 +33,13 @@ class COCODataSet(DetDataset): anno_path (str): coco annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. - load_crowd (bool): whether to load crowded ground-truth. + load_crowd (bool): whether to load crowded ground-truth. False as default allow_empty (bool): whether to load empty entry. False as default - empty_ratio (float): the ratio of empty record number to total - record's, if empty_ratio is out of [0. ,1.), do not sample the + empty_ratio (float): the ratio of empty record number to total + record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default + repeat (int): repeat times for dataset, use in benchmark. 
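
# Illustrative sketch (not part of the patch): the `repeat` option documented
# on the line above (and implemented in DetDataset later in this patch) only
# multiplies the virtual dataset length and wraps the index back into range.
# A minimal stand-alone model of that behaviour:
class Repeated:
    def __init__(self, items, repeat=1):
        self.items, self.repeat = items, repeat

    def __len__(self):
        return len(self.items) * self.repeat

    def __getitem__(self, idx):
        # indices beyond the real length wrap around to the same records
        return self.items[idx % len(self.items)]


r = Repeated(['a', 'b', 'c'], repeat=2)
assert len(r) == 6 and r[4] == 'b'
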
""" def __init__(self, @@ -49,9 +50,15 @@ class COCODataSet(DetDataset): sample_num=-1, load_crowd=False, allow_empty=False, - empty_ratio=1.): - super(COCODataSet, self).__init__(dataset_dir, image_dir, anno_path, - data_fields, sample_num) + empty_ratio=1., + repeat=1): + super(COCODataSet, self).__init__( + dataset_dir, + image_dir, + anno_path, + data_fields, + sample_num, + repeat=repeat) self.load_image_only = False self.load_semantic = False self.load_crowd = load_crowd @@ -138,25 +145,14 @@ class COCODataSet(DetDataset): if not any(np.array(inst['bbox'])): continue - # read rbox anno or not - is_rbox_anno = True if len(inst['bbox']) == 5 else False - if is_rbox_anno: - xc, yc, box_w, box_h, angle = inst['bbox'] - x1 = xc - box_w / 2.0 - y1 = yc - box_h / 2.0 - x2 = x1 + box_w - y2 = y1 + box_h - else: - x1, y1, box_w, box_h = inst['bbox'] - x2 = x1 + box_w - y2 = y1 + box_h + x1, y1, box_w, box_h = inst['bbox'] + x2 = x1 + box_w + y2 = y1 + box_h eps = 1e-5 if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: inst['clean_bbox'] = [ round(float(x), 3) for x in [x1, y1, x2, y2] ] - if is_rbox_anno: - inst['clean_rbox'] = [xc, yc, box_w, box_h, angle] bboxes.append(inst) else: logger.warning( @@ -171,9 +167,6 @@ class COCODataSet(DetDataset): is_empty = True gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) - if is_rbox_anno: - gt_rbox = np.zeros((num_bbox, 5), dtype=np.float32) - gt_theta = np.zeros((num_bbox, 1), dtype=np.int32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) gt_poly = [None] * num_bbox @@ -183,13 +176,10 @@ class COCODataSet(DetDataset): catid = box['category_id'] gt_class[i][0] = self.catid2clsid[catid] gt_bbox[i, :] = box['clean_bbox'] - # xc, yc, w, h, theta - if is_rbox_anno: - gt_rbox[i, :] = box['clean_rbox'] is_crowd[i][0] = box['iscrowd'] - # check RLE format + # check RLE format if 'segmentation' in box and box['iscrowd'] == 1: - gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] + gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] elif 'segmentation' in box and box['segmentation']: if not np.array(box['segmentation'] ).size > 0 and not self.allow_empty: @@ -206,21 +196,12 @@ class COCODataSet(DetDataset): gt_poly) and not self.allow_empty: continue - if is_rbox_anno: - gt_rec = { - 'is_crowd': is_crowd, - 'gt_class': gt_class, - 'gt_bbox': gt_bbox, - 'gt_rbox': gt_rbox, - 'gt_poly': gt_poly, - } - else: - gt_rec = { - 'is_crowd': is_crowd, - 'gt_class': gt_class, - 'gt_bbox': gt_bbox, - 'gt_poly': gt_poly, - } + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_poly': gt_poly, + } for k, v in gt_rec.items(): if k in self.data_fields: @@ -247,3 +228,126 @@ class COCODataSet(DetDataset): empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records + + +@register +@serializable +class SlicedCOCODataSet(COCODataSet): + """Sliced COCODataSet""" + + def __init__( + self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + load_crowd=False, + allow_empty=False, + empty_ratio=1., + repeat=1, + sliced_size=[640, 640], + overlap_ratio=[0.25, 0.25], ): + super(SlicedCOCODataSet, self).__init__( + dataset_dir=dataset_dir, + image_dir=image_dir, + anno_path=anno_path, + data_fields=data_fields, + sample_num=sample_num, + load_crowd=load_crowd, + allow_empty=allow_empty, + empty_ratio=empty_ratio, + repeat=repeat, ) + self.sliced_size = sliced_size + self.overlap_ratio = 
overlap_ratio
+
+    def parse_dataset(self):
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+        ct_sub = 0
+
+        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
+        self.cname2cid = dict({
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        })
+
+        if 'annotations' not in coco.dataset:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contain ground truth, '
+                           'load image information only.'.format(anno_path))
+        try:
+            import sahi
+            from sahi.slicing import slice_image
+        except Exception as e:
+            logger.error(
+                'sahi not found, please install sahi. '
+                'for example: `pip install sahi`, see https://github.com/obss/sahi.'
+            )
+            raise e
+
+        sub_img_ids = 0
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            slice_image_result = sahi.slicing.slice_image(
+                image=im_path,
+                slice_height=self.sliced_size[0],
+                slice_width=self.sliced_size[1],
+                overlap_height_ratio=self.overlap_ratio[0],
+                overlap_width_ratio=self.overlap_ratio[1])
+
+            sub_img_num = len(slice_image_result)
+            for _ind in range(sub_img_num):
+                im = slice_image_result.images[_ind]
+                coco_rec = {
+                    'image': im,
+                    'im_id': np.array([sub_img_ids + _ind]),
+                    'h': im.shape[0],
+                    'w': im.shape[1],
+                    'ori_im_id': np.array([img_id]),
+                    'st_pix': np.array(
+                        slice_image_result.starting_pixels[_ind],
+                        dtype=np.float32),
+                    'is_last': 1 if _ind == sub_img_num - 1 else 0,
+                } if 'image' in self.data_fields else {}
+                records.append(coco_rec)
+            ct_sub += sub_img_num
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'no coco record found in %s' % (anno_path)
+        logger.info('{} samples sliced into {} sub_samples in file {}'.format(
+            ct, ct_sub, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
diff --git a/paddlers/models/ppdet/data/source/dataset.py b/paddlers/models/ppdet/data/source/dataset.py
index 7345d2b..bf3c0c7 100644
--- a/paddlers/models/ppdet/data/source/dataset.py
+++ b/paddlers/models/ppdet/data/source/dataset.py
@@ -1,20 +1,20 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import os +import copy import numpy as np - try: from collections.abc import Sequence except Exception: @@ -22,7 +22,10 @@ except Exception: from paddle.io import Dataset from paddlers.models.ppdet.core.workspace import register, serializable from paddlers.models.ppdet.utils.download import get_dataset_path -import copy +from paddlers.models.ppdet.data import source + +from paddlers.models.ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) @serializable @@ -37,6 +40,7 @@ class DetDataset(Dataset): data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. use_default_label (bool): whether to load default label list. + repeat (int): repeat times for dataset, use in benchmark. """ def __init__(self, @@ -46,6 +50,7 @@ class DetDataset(Dataset): data_fields=['image'], sample_num=-1, use_default_label=None, + repeat=1, **kwargs): super(DetDataset, self).__init__() self.dataset_dir = dataset_dir if dataset_dir is not None else '' @@ -54,28 +59,32 @@ class DetDataset(Dataset): self.data_fields = data_fields self.sample_num = sample_num self.use_default_label = use_default_label + self.repeat = repeat self._epoch = 0 self._curr_iter = 0 def __len__(self, ): - return len(self.roidbs) + return len(self.roidbs) * self.repeat + + def __call__(self, *args, **kwargs): + return self def __getitem__(self, idx): + n = len(self.roidbs) + if self.repeat > 1: + idx %= n # data batch roidb = copy.deepcopy(self.roidbs[idx]) if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: - n = len(self.roidbs) idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: - n = len(self.roidbs) idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: - n = len(self.roidbs) roidb = [roidb, ] + [ copy.deepcopy(self.roidbs[np.random.randint(n)]) - for _ in range(3) + for _ in range(4) ] if isinstance(roidb, Sequence): for r in roidb: @@ -149,12 +158,15 @@ class ImageFolder(DetDataset): self.sample_num = sample_num def check_or_download_dataset(self): + return + + def get_anno(self): + if self.anno_path is None: + return if self.dataset_dir: - # NOTE: ImageFolder is only used for prediction, in - # infer mode, image_dir is set by set_images - # so we only check anno_path here - self.dataset_dir = get_dataset_path(self.dataset_dir, - self.anno_path, None) + return os.path.join(self.dataset_dir, self.anno_path) + else: + return self.anno_path def parse_dataset(self, ): if not self.roidbs: @@ -195,3 +207,93 @@ class ImageFolder(DetDataset): def set_images(self, images): self.image_dir = images self.roidbs = self._load_images() + + def set_slice_images(self, + images, + slice_size=[640, 640], + 
overlap_ratio=[0.25, 0.25]):
+        self.image_dir = images
+        ori_records = self._load_images()
+        try:
+            import sahi
+            from sahi.slicing import slice_image
+        except Exception as e:
+            logger.error(
+                'sahi not found, please install sahi. '
+                'for example: `pip install sahi`, see https://github.com/obss/sahi.'
+            )
+            raise e
+
+        sub_img_ids = 0
+        ct = 0
+        ct_sub = 0
+        records = []
+        for i, ori_rec in enumerate(ori_records):
+            im_path = ori_rec['im_file']
+            slice_image_result = sahi.slicing.slice_image(
+                image=im_path,
+                slice_height=slice_size[0],
+                slice_width=slice_size[1],
+                overlap_height_ratio=overlap_ratio[0],
+                overlap_width_ratio=overlap_ratio[1])
+
+            sub_img_num = len(slice_image_result)
+            for _ind in range(sub_img_num):
+                im = slice_image_result.images[_ind]
+                rec = {
+                    'image': im,
+                    'im_id': np.array([sub_img_ids + _ind]),
+                    'h': im.shape[0],
+                    'w': im.shape[1],
+                    'ori_im_id': np.array([ori_rec['im_id'][0]]),
+                    'st_pix': np.array(
+                        slice_image_result.starting_pixels[_ind],
+                        dtype=np.float32),
+                    'is_last': 1 if _ind == sub_img_num - 1 else 0,
+                } if 'image' in self.data_fields else {}
+                records.append(rec)
+            ct_sub += sub_img_num
+            ct += 1
+        print('{} samples sliced into {} sub_samples'.format(ct, ct_sub))
+        self.roidbs = records
+
+    def get_label_list(self):
+        # Only VOC dataset needs label list in ImageFolder
+        return self.anno_path
+
+
+@register
+class CommonDataset(object):
+    def __init__(self, **dataset_args):
+        super(CommonDataset, self).__init__()
+        dataset_args = copy.deepcopy(dataset_args)
+        type = dataset_args.pop("name")
+        self.dataset = getattr(source, type)(**dataset_args)
+
+    def __call__(self):
+        return self.dataset
+
+
+@register
+class TrainDataset(CommonDataset):
+    pass
+
+
+@register
+class EvalMOTDataset(CommonDataset):
+    pass
+
+
+@register
+class TestMOTDataset(CommonDataset):
+    pass
+
+
+@register
+class EvalDataset(CommonDataset):
+    pass
+
+
+@register
+class TestDataset(CommonDataset):
+    pass
diff --git a/paddlers/models/ppdet/data/source/keypoint_coco.py b/paddlers/models/ppdet/data/source/keypoint_coco.py
index d51e674..e2c36d7 100644
--- a/paddlers/models/ppdet/data/source/keypoint_coco.py
+++ b/paddlers/models/ppdet/data/source/keypoint_coco.py
@@ -1,15 +1,15 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
 # limitations under the License.
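
# Illustrative sketch (not part of the patch): how the SAHI calls above are
# driven, and why each sliced record carries 'st_pix'. slice_image() returns
# an object exposing .images and .starting_pixels -- exactly what
# set_slice_images() and SlicedCOCODataSet.parse_dataset() consume. The image
# path is a hypothetical placeholder, and shift_to_original() is a
# hypothetical helper (assuming (x, y) pixel order for st_pix), not ppdet API.
import numpy as np
from sahi.slicing import slice_image

result = slice_image(
    image='demo.jpg',  # hypothetical input image
    slice_height=640,
    slice_width=640,
    overlap_height_ratio=0.25,
    overlap_width_ratio=0.25)


def shift_to_original(boxes_xyxy, st_pix):
    # map boxes predicted on a slice back to original-image coordinates
    boxes = np.asarray(boxes_xyxy, dtype=np.float32).copy()
    boxes[:, 0::2] += st_pix[0]  # x1, x2
    boxes[:, 1::2] += st_pix[1]  # y1, y2
    return boxes


for patch, st_pix in zip(result.images, result.starting_pixels):
    print(patch.shape, st_pix)  # HWC ndarray and its starting pixel
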
""" this code is base on https://github.com/open-mmlab/mmpose @@ -27,7 +27,7 @@ from paddlers.models.ppdet.core.workspace import register, serializable @serializable class KeypointBottomUpBaseDataset(DetDataset): - """Base class for bottom-up datasets. + """Base class for bottom-up datasets. All datasets should subclass it. All subclasses should overwrite: @@ -91,7 +91,7 @@ class KeypointBottomUpBaseDataset(DetDataset): @register @serializable class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset): - """COCO dataset for bottom-up pose estimation. + """COCO dataset for bottom-up pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. @@ -262,7 +262,7 @@ class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset): @register @serializable class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset): - """CrowdPose dataset for bottom-up pose estimation. + """CrowdPose dataset for bottom-up pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. @@ -386,7 +386,7 @@ class KeypointTopDownBaseDataset(DetDataset): @register @serializable class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): - """COCO dataset for top-down pose estimation. + """COCO dataset for top-down pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. diff --git a/paddlers/models/ppdet/data/source/mot.py b/paddlers/models/ppdet/data/source/mot.py index 3e96ba0..0386387 100644 --- a/paddlers/models/ppdet/data/source/mot.py +++ b/paddlers/models/ppdet/data/source/mot.py @@ -39,15 +39,16 @@ class MOTDataSet(DetDataset): image_lists (str|list): mot data image lists, muiti-source mot dataset. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. + repeat (int): repeat times for dataset, use in benchmark. Notes: MOT datasets root directory following this: dataset/mot |——————image_lists - | |——————caltech.train - | |——————caltech.val - | |——————mot16.train - | |——————mot17.train + | |——————caltech.train + | |——————caltech.val + | |——————mot16.train + | |——————mot17.train | ...... |——————Caltech |——————MOT17 @@ -77,11 +78,13 @@ class MOTDataSet(DetDataset): dataset_dir=None, image_lists=[], data_fields=['image'], - sample_num=-1): + sample_num=-1, + repeat=1): super(MOTDataSet, self).__init__( dataset_dir=dataset_dir, data_fields=data_fields, - sample_num=sample_num) + sample_num=sample_num, + repeat=repeat) self.dataset_dir = dataset_dir self.image_lists = image_lists if isinstance(self.image_lists, str): @@ -243,8 +246,8 @@ class MCMOTDataSet(DetDataset): MCMOT datasets root directory following this: dataset/mot |——————image_lists - | |——————visdrone_mcmot.train - | |——————visdrone_mcmot.val + | |——————visdrone_mcmot.train + | |——————visdrone_mcmot.val visdrone_mcmot |——————images | └——————train @@ -348,10 +351,10 @@ class MCMOTDataSet(DetDataset): self.num_imgs_each_data = [len(x) for x in self.img_files.values()] self.total_imgs = sum(self.num_imgs_each_data) - # cname2cid and cid2cname + # cname2cid and cid2cname cname2cid = {} if self.label_list is not None: - # if use label_list for multi source mix dataset, + # if use label_list for multi source mix dataset, # please make sure label_list in the first sub_dataset at least. 
sub_dataset = self.image_lists[0].split('.')[0] label_path = os.path.join(self.dataset_dir, sub_dataset, @@ -461,7 +464,7 @@ class MOTImageFolder(DetDataset): video_file (str): path of the video file, default ''. frame_rate (int): frame rate of the video, use cv2 VideoCapture if not set. dataset_dir (str): root directory for dataset. - keep_ori_im (bool): whether to keep original image, default False. + keep_ori_im (bool): whether to keep original image, default False. Set True when used during MOT model inference while saving images or video, or used in DeepSORT. """ @@ -474,6 +477,7 @@ class MOTImageFolder(DetDataset): image_dir=None, sample_num=-1, keep_ori_im=False, + anno_path=None, **kwargs): super(MOTImageFolder, self).__init__( dataset_dir, image_dir, sample_num=sample_num) @@ -483,6 +487,7 @@ class MOTImageFolder(DetDataset): self._imid2path = {} self.roidbs = None self.frame_rate = frame_rate + self.anno_path = anno_path def check_or_download_dataset(self): return @@ -573,6 +578,9 @@ class MOTImageFolder(DetDataset): "wrong or unsupported file format: {}".format(self.video_file) self.roidbs = self._load_video_images() + def get_anno(self): + return self.anno_path + def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', 'flv')): return f.lower().endswith(extensions) diff --git a/paddlers/models/ppdet/data/source/voc.py b/paddlers/models/ppdet/data/source/voc.py index 41324c3..5a1e0cf 100644 --- a/paddlers/models/ppdet/data/source/voc.py +++ b/paddlers/models/ppdet/data/source/voc.py @@ -43,9 +43,10 @@ class VOCDataSet(DetDataset): label_list (str): if use_default_label is False, will load mapping between category and class index. allow_empty (bool): whether to load empty entry. False as default - empty_ratio (float): the ratio of empty record number to total - record's, if empty_ratio is out of [0. ,1.), do not sample the + empty_ratio (float): the ratio of empty record number to total + record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default + repeat (int): repeat times for dataset, use in benchmark. """ def __init__(self, @@ -56,13 +57,15 @@ class VOCDataSet(DetDataset): sample_num=-1, label_list=None, allow_empty=False, - empty_ratio=1.): + empty_ratio=1., + repeat=1): super(VOCDataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, - sample_num=sample_num) + sample_num=sample_num, + repeat=repeat) self.label_list = label_list self.allow_empty = allow_empty self.empty_ratio = empty_ratio diff --git a/paddlers/models/ppdet/data/transform/__init__.py b/paddlers/models/ppdet/data/transform/__init__.py index b6af6ae..58cec84 100644 --- a/paddlers/models/ppdet/data/transform/__init__.py +++ b/paddlers/models/ppdet/data/transform/__init__.py @@ -16,11 +16,13 @@ from . import operators from . import batch_operators from . import keypoint_operators from . import mot_operators +from . 
import rotated_operators
 from .operators import *
 from .batch_operators import *
 from .keypoint_operators import *
 from .mot_operators import *
+from .rotated_operators import *
 
 __all__ = []
 __all__ += registered_ops
diff --git a/paddlers/models/ppdet/data/transform/autoaugment_utils.py b/paddlers/models/ppdet/data/transform/autoaugment_utils.py
index 4fbfa4e..094f827 100644
--- a/paddlers/models/ppdet/data/transform/autoaugment_utils.py
+++ b/paddlers/models/ppdet/data/transform/autoaugment_utils.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Reference:
+# Reference:
 #     https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py
 """AutoAugment util file."""
@@ -65,7 +65,7 @@ def policy_v1():
       [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)],
       [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)],
       [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)],
-      [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)],  # ,
+      [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)],  # ,
      [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)],
      [('Color', 1.0, 6), ('Equalize', 1.0, 2)],
      [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)],
diff --git a/paddlers/models/ppdet/data/transform/batch_operators.py b/paddlers/models/ppdet/data/transform/batch_operators.py
index 69a3e88..d64b94d 100644
--- a/paddlers/models/ppdet/data/transform/batch_operators.py
+++ b/paddlers/models/ppdet/data/transform/batch_operators.py
@@ -47,6 +47,8 @@ __all__ = [
     'PadMaskBatch',
     'Gt2GFLTarget',
     'Gt2CenterNetTarget',
+    'PadGT',
+    'PadRGT',
 ]
@@ -108,12 +110,6 @@ class PadBatch(BaseOperator):
                 padding_segm[:, :im_h, :im_w] = gt_segm
                 data['gt_segm'] = padding_segm
-            if 'gt_rbox2poly' in data and data['gt_rbox2poly'] is not None:
-                # ploy to rbox
-                polys = data['gt_rbox2poly']
-                rbox = bbox_utils.poly2rbox(polys)
-                data['gt_rbox'] = rbox
-
         return samples
@@ -233,7 +229,7 @@ class Gt2YoloTarget(BaseOperator):
                     gi = int(gx * grid_w)
                     gj = int(gy * grid_h)
-                    # gtbox should be regresed in this layes if best match
+                    # gt box should be regressed in this layer if the best match
                     # anchor index in anchor mask of this layer
                     if best_idx in mask:
                         best_n = mask.index(best_idx)
@@ -253,7 +249,7 @@ class Gt2YoloTarget(BaseOperator):
                     # classification
                     target[best_n, 6 + cls, gj, gi] = 1.
-                    # For non-matched anchors, calculate the target if the iou
+                    # For non-matched anchors, calculate the target if the iou
                     # between anchor and gt is larger than iou_thresh
                     if self.iou_thresh < 1:
                         for idx, mask_i in enumerate(mask):
@@ -618,7 +614,7 @@ class Gt2TTFTarget(BaseOperator):
     """
     Gt2TTFTarget
     Generate TTFNet targets by ground truth data
-
+
     Args:
         num_classes(int): the number of classes.
         down_ratio(int): the down ratio from images to heatmap, 4 by default.
@@ -980,12 +976,6 @@ class PadMaskBatch(BaseOperator):
                 padding_mask[:im_h, :im_w] = 1.
                 data['pad_mask'] = padding_mask
-            if 'gt_rbox2poly' in data and data['gt_rbox2poly'] is not None:
-                # ploy to rbox
-                polys = data['gt_rbox2poly']
-                rbox = bbox_utils.poly2rbox(polys)
-                data['gt_rbox'] = rbox
-
         return samples
@@ -994,7 +984,7 @@ class Gt2CenterNetTarget(BaseOperator):
     """Gt2CenterNetTarget
     Genterate CenterNet targets by ground-truth
     Args:
-        down_ratio (int): The down sample ratio between output feature and
+        down_ratio (int): The down sample ratio between output feature and
            input image.
num_classes (int): The number of classes, 80 by default. max_objs (int): The maximum objects detected, 128 by default. @@ -1068,3 +1058,110 @@ class Gt2CenterNetTarget(BaseOperator): sample['size'] = wh sample['offset'] = reg return sample + + +@register_op +class PadGT(BaseOperator): + """ + Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... + The num_max_boxes is the largest for batch. + Args: + return_gt_mask (bool): If true, return `pad_gt_mask`, + 1 means bbox, 0 means no bbox. + """ + + def __init__(self, return_gt_mask=True): + super(PadGT, self).__init__() + self.return_gt_mask = return_gt_mask + + def __call__(self, samples, context=None): + num_max_boxes = max([len(s['gt_bbox']) for s in samples]) + for sample in samples: + if self.return_gt_mask: + sample['pad_gt_mask'] = np.zeros( + (num_max_boxes, 1), dtype=np.float32) + if num_max_boxes == 0: + continue + + num_gt = len(sample['gt_bbox']) + pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) + pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) + if num_gt > 0: + pad_gt_class[:num_gt] = sample['gt_class'] + pad_gt_bbox[:num_gt] = sample['gt_bbox'] + sample['gt_class'] = pad_gt_class + sample['gt_bbox'] = pad_gt_bbox + # pad_gt_mask + if 'pad_gt_mask' in sample: + sample['pad_gt_mask'][:num_gt] = 1 + # gt_score + if 'gt_score' in sample: + pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32) + if num_gt > 0: + pad_gt_score[:num_gt] = sample['gt_score'] + sample['gt_score'] = pad_gt_score + if 'is_crowd' in sample: + pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32) + if num_gt > 0: + pad_is_crowd[:num_gt] = sample['is_crowd'] + sample['is_crowd'] = pad_is_crowd + if 'difficult' in sample: + pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32) + if num_gt > 0: + pad_diff[:num_gt] = sample['difficult'] + sample['difficult'] = pad_diff + return samples + + +@register_op +class PadRGT(BaseOperator): + """ + Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... + The num_max_boxes is the largest for batch. + Args: + return_gt_mask (bool): If true, return `pad_gt_mask`, + 1 means bbox, 0 means no bbox. 
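
# Illustrative sketch (not part of the patch): the effect of PadGT above on a
# toy batch. Every sample is padded to the batch's largest box count, and
# pad_gt_mask marks which rows are real boxes (1.) versus padding (0.).
import numpy as np

samples = [{'gt_bbox': np.ones((3, 4), np.float32)},
           {'gt_bbox': np.ones((1, 4), np.float32)}]
num_max = max(len(s['gt_bbox']) for s in samples)
for s in samples:
    n = len(s['gt_bbox'])
    bbox = np.zeros((num_max, 4), np.float32)
    mask = np.zeros((num_max, 1), np.float32)
    bbox[:n], mask[:n] = s['gt_bbox'], 1.
    s['gt_bbox'], s['pad_gt_mask'] = bbox, mask

assert samples[1]['gt_bbox'].shape == (3, 4)
assert samples[1]['pad_gt_mask'].sum() == 1.0
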
+ """ + + def __init__(self, return_gt_mask=True): + super(PadRGT, self).__init__() + self.return_gt_mask = return_gt_mask + + def pad_field(self, sample, field, num_gt): + name, shape, dtype = field + if name in sample: + pad_v = np.zeros(shape, dtype=dtype) + if num_gt > 0: + pad_v[:num_gt] = sample[name] + sample[name] = pad_v + + def __call__(self, samples, context=None): + num_max_boxes = max([len(s['gt_bbox']) for s in samples]) + for sample in samples: + if self.return_gt_mask: + sample['pad_gt_mask'] = np.zeros( + (num_max_boxes, 1), dtype=np.float32) + if num_max_boxes == 0: + continue + + num_gt = len(sample['gt_bbox']) + pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) + pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) + if num_gt > 0: + pad_gt_class[:num_gt] = sample['gt_class'] + pad_gt_bbox[:num_gt] = sample['gt_bbox'] + sample['gt_class'] = pad_gt_class + sample['gt_bbox'] = pad_gt_bbox + # pad_gt_mask + if 'pad_gt_mask' in sample: + sample['pad_gt_mask'][:num_gt] = 1 + # gt_score + names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox'] + dims = [1, 1, 1, 8, 5] + dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32] + + for name, dim, dtype in zip(names, dims, dtypes): + self.pad_field(sample, [name, (num_max_boxes, dim), dtype], + num_gt) + + return samples diff --git a/paddlers/models/ppdet/data/transform/keypoint_operators.py b/paddlers/models/ppdet/data/transform/keypoint_operators.py index c9b9648..b33e33b 100644 --- a/paddlers/models/ppdet/data/transform/keypoint_operators.py +++ b/paddlers/models/ppdet/data/transform/keypoint_operators.py @@ -511,18 +511,18 @@ class RandomFlipHalfBodyTransform(object): @register_keypointop class AugmentationbyInformantionDropping(object): - """AID: Augmentation by Informantion Dropping. Please refer - to https://arxiv.org/abs/2008.07139 - + """AID: Augmentation by Informantion Dropping. Please refer + to https://arxiv.org/abs/2008.07139 + Args: prob_cutout (float): The probability of the Cutout augmentation. offset_factor (float): Offset factor of cutout center. - num_patch (int): Number of patches to be cutout. + num_patch (int): Number of patches to be cutout. records(dict): the dict contained the image and coords - + Returns: records (dict): contain the image and coords after tranformed - + """ def __init__(self, @@ -698,8 +698,8 @@ class ToHeatmapsTopDown(object): tmp_size = self.sigma * 3 feat_stride = image_size / self.hmsize for joint_id in range(num_joints): - mu_x = int(joints[joint_id][0] + 0.5) / feat_stride[0] - mu_y = int(joints[joint_id][1] + 0.5) / feat_stride[1] + mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] @@ -796,7 +796,7 @@ class ToHeatmapsTopDown_DARK(object): class ToHeatmapsTopDown_UDP(object): """This code is based on: https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py - + to generate the gaussian heatmaps of keypoint for heatmap loss. ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
diff --git a/paddlers/models/ppdet/data/transform/mot_operators.py b/paddlers/models/ppdet/data/transform/mot_operators.py index 43bdac1..da9c23d 100644 --- a/paddlers/models/ppdet/data/transform/mot_operators.py +++ b/paddlers/models/ppdet/data/transform/mot_operators.py @@ -132,7 +132,7 @@ class LetterBoxResize(BaseOperator): @register_op class MOTRandomAffine(BaseOperator): - """ + """ Affine transform to image and coords to achieve the rotate, scale and shift effect for training image. @@ -271,7 +271,7 @@ class Gt2JDETargetThres(BaseOperator): anchors (list): anchors of JDE model anchor_masks (list): anchor_masks of JDE model downsample_ratios (list): downsample ratios of JDE model - ide_thresh (float): thresh of identity, higher is groud truth + ide_thresh (float): thresh of identity, higher is groud truth fg_thresh (float): thresh of foreground, higher is foreground bg_thresh (float): thresh of background, lower is background num_classes (int): number of classes @@ -529,8 +529,8 @@ class Gt2FairMOTTarget(Gt2TTFTarget): Generate FairMOT targets by ground truth data. Difference between Gt2FairMOTTarget and Gt2TTFTarget are: 1. the gaussian kernal radius to generate a heatmap. - 2. the targets needed during traing. - + 2. the targets needed during training. + Args: num_classes(int): the number of classes. down_ratio(int): the down ratio from images to heatmap, 4 by default. diff --git a/paddlers/models/ppdet/data/transform/operators.py b/paddlers/models/ppdet/data/transform/operators.py index 8e09902..078ba2e 100644 --- a/paddlers/models/ppdet/data/transform/operators.py +++ b/paddlers/models/ppdet/data/transform/operators.py @@ -41,7 +41,6 @@ import threading MUTEX = threading.Lock() from paddlers.models.ppdet.core.workspace import serializable -from paddlers.models.ppdet.modeling import bbox_utils from ..reader import Compose from .op_helper import (satisfy_sample_constraint, filter_and_process, @@ -123,12 +122,15 @@ class Decode(BaseOperator): sample['image'] = f.read() sample.pop('im_file') - im = sample['image'] - data = np.frombuffer(im, dtype='uint8') - im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode - if 'keep_ori_im' in sample and sample['keep_ori_im']: - sample['ori_image'] = im - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + try: + im = sample['image'] + data = np.frombuffer(im, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + if 'keep_ori_im' in sample and sample['keep_ori_im']: + sample['ori_image'] = im + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + except: + im = sample['image'] sample['image'] = im if 'h' not in sample: @@ -357,19 +359,26 @@ class RandomErasingImage(BaseOperator): @register_op class NormalizeImage(BaseOperator): - def __init__(self, mean=[0.485, 0.456, 0.406], std=[1, 1, 1], - is_scale=True): + def __init__(self, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + is_scale=True, + norm_type='mean_std'): """ Args: mean (list): the pixel mean std (list): the pixel variance + is_scale (bool): scale the pixel to [0,1] + norm_type (str): type in ['mean_std', 'none'] """ super(NormalizeImage, self).__init__() self.mean = mean self.std = std self.is_scale = is_scale + self.norm_type = norm_type if not (isinstance(self.mean, list) and isinstance(self.std, list) and - isinstance(self.is_scale, bool)): + isinstance(self.is_scale, bool) and + self.norm_type in ['mean_std', 'none']): raise TypeError("{}: input type is invalid.".format(self)) from functools import reduce if reduce(lambda x, y: x * y, self.std) == 0: @@ -378,20 
+387,20 @@ class NormalizeImage(BaseOperator): def apply(self, sample, context=None): """Normalize the image. Operators: - 1.(optional) Scale the image to [0,1] - 2. Each pixel minus mean and is divided by std + 1.(optional) Scale the pixel to [0,1] + 2.(optional) Each pixel minus mean and is divided by std """ im = sample['image'] im = im.astype(np.float32, copy=False) - mean = np.array(self.mean)[np.newaxis, np.newaxis, :] - std = np.array(self.std)[np.newaxis, np.newaxis, :] - if self.is_scale: - im = im / 255.0 - - im -= mean - im /= std - + scale = 1.0 / 255.0 + im *= scale + + if self.norm_type == 'mean_std': + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + im -= mean + im /= std sample['image'] = im return sample @@ -448,6 +457,10 @@ class GridMask(BaseOperator): @register_op class RandomDistort(BaseOperator): """Random color distortion. + Note: + The 'probability' in [lower, upper, probability] is the probability of not using this transformation, + not the probability of using this transformation. And this only applies in this operator(RandomDistort), + 'probability' in other BaseOperator means the probability of using that transformation. Args: hue (list): hue settings. in [lower, upper, probability] format. saturation (list): saturation settings. in [lower, upper, probability] format. @@ -657,18 +670,6 @@ class RandomFlip(BaseOperator): bbox[:, 2] = width - oldx1 return bbox - def apply_rbox(self, bbox, width): - oldx1 = bbox[:, 0].copy() - oldx2 = bbox[:, 2].copy() - oldx3 = bbox[:, 4].copy() - oldx4 = bbox[:, 6].copy() - bbox[:, 0] = width - oldx1 - bbox[:, 2] = width - oldx2 - bbox[:, 4] = width - oldx3 - bbox[:, 6] = width - oldx4 - bbox = [bbox_utils.get_best_begin_point_single(e) for e in bbox] - return bbox - def apply(self, sample, context=None): """Filp the image and bounding box. Operators: @@ -700,10 +701,6 @@ class RandomFlip(BaseOperator): if 'gt_segm' in sample and sample['gt_segm'].any(): sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] - if 'gt_rbox2poly' in sample and sample['gt_rbox2poly'].any(): - sample['gt_rbox2poly'] = self.apply_rbox(sample['gt_rbox2poly'], - width) - sample['flipped'] = True sample['image'] = im return sample @@ -713,7 +710,7 @@ class RandomFlip(BaseOperator): class Resize(BaseOperator): def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): """ - Resize image to target size. if keep_ratio is True, + Resize image to target size. if keep_ratio is True, resize the image's long side to the maximum of target_size if keep_ratio is False, resize the image to target size(h, w) Args: @@ -824,7 +821,7 @@ class Resize(BaseOperator): im_scale_x = resize_w / im_shape[1] im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) - sample['image'] = im + sample['image'] = im.astype(np.float32) sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] @@ -841,16 +838,6 @@ class Resize(BaseOperator): [im_scale_x, im_scale_y], [resize_w, resize_h]) - # apply rbox - if 'gt_rbox2poly' in sample: - if np.array(sample['gt_rbox2poly']).shape[1] != 8: - logger.warning( - "gt_rbox2poly's length shoule be 8, but actually is {}". 
- format(len(sample['gt_rbox2poly']))) - sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], @@ -1054,7 +1041,7 @@ class CropWithSampling(BaseOperator): [max sample, max trial, min scale, max scale, min aspect ratio, max aspect ratio, min overlap, max overlap] - avoid_no_bbox (bool): whether to to avoid the + avoid_no_bbox (bool): whether to avoid the situation where the box does not appear. """ super(CropWithSampling, self).__init__() @@ -1145,7 +1132,7 @@ class CropWithDataAchorSampling(BaseOperator): das_anchor_scales (list[float]): a list of anchor scales in data anchor smapling. min_size (float): minimum size of sampled bbox. - avoid_no_bbox (bool): whether to to avoid the + avoid_no_bbox (bool): whether to avoid the situation where the box does not appear. """ super(CropWithDataAchorSampling, self).__init__() @@ -1504,6 +1491,11 @@ class RandomCrop(BaseOperator): if 'is_crowd' in sample: sample['is_crowd'] = np.take( sample['is_crowd'], valid_ids, axis=0) + + if 'difficult' in sample: + sample['difficult'] = np.take( + sample['difficult'], valid_ids, axis=0) + return sample return sample @@ -1604,7 +1596,7 @@ class RandomScaledCrop(BaseOperator): @register_op class Cutmix(BaseOperator): def __init__(self, alpha=1.5, beta=1.5): - """ + """ CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899 Cutmix image and gt_bbbox/gt_score Args: @@ -1747,7 +1739,7 @@ class Mixup(BaseOperator): gt_score2 = np.ones_like(sample[1]['gt_class']) gt_score = np.concatenate( (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0) - result['gt_score'] = gt_score + result['gt_score'] = gt_score.astype('float32') if 'is_crowd' in sample[0]: is_crowd1 = sample[0]['is_crowd'] is_crowd2 = sample[1]['is_crowd'] @@ -2029,13 +2021,14 @@ class Pad(BaseOperator): if self.size: h, w = self.size assert ( - im_h < h and im_w < w + im_h <= h and im_w <= w ), '(h, w) of target size should be greater than (im_h, im_w)' else: - h = np.ceil(im_h / self.size_divisor) * self.size_divisor - w = np.ceil(im_w / self.size_divisor) * self.size_divisor + h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor) + w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor) if h == im_h and w == im_w: + sample['image'] = im.astype(np.float32) return sample if self.pad_mode == -1: @@ -2106,44 +2099,30 @@ class Poly2Mask(BaseOperator): @register_op -class Rbox2Poly(BaseOperator): - """ - Convert rbbox format to poly format. +class AugmentHSV(BaseOperator): + """ + Augment the SV channel of image data. + Args: + fraction (float): the fraction for augment. Default: 0.5. + is_bgr (bool): whether the image is BGR mode. Default: True. 
+ hgain (float): H channel gains + sgain (float): S channel gains + vgain (float): V channel gains """ - def __init__(self): - super(Rbox2Poly, self).__init__() - - def apply(self, sample, context=None): - assert 'gt_rbox' in sample - assert sample['gt_rbox'].shape[1] == 5 - rrects = sample['gt_rbox'] - x_ctr = rrects[:, 0] - y_ctr = rrects[:, 1] - width = rrects[:, 2] - height = rrects[:, 3] - x1 = x_ctr - width / 2.0 - y1 = y_ctr - height / 2.0 - x2 = x_ctr + width / 2.0 - y2 = y_ctr + height / 2.0 - sample['gt_bbox'] = np.stack([x1, y1, x2, y2], axis=1) - polys = bbox_utils.rbox2poly_np(rrects) - sample['gt_rbox2poly'] = polys - return sample - - -@register_op -class AugmentHSV(BaseOperator): - def __init__(self, fraction=0.50, is_bgr=True): - """ - Augment the SV channel of image data. - Args: - fraction (float): the fraction for augment. Default: 0.5. - is_bgr (bool): whether the image is BGR mode. Default: True. - """ + def __init__(self, + fraction=0.50, + is_bgr=True, + hgain=None, + sgain=None, + vgain=None): super(AugmentHSV, self).__init__() self.fraction = fraction self.is_bgr = is_bgr + self.hgain = hgain + self.sgain = sgain + self.vgain = vgain + self.use_hsvgain = False if hgain is None else True def apply(self, sample, context=None): img = sample['image'] @@ -2151,27 +2130,39 @@ class AugmentHSV(BaseOperator): img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) else: img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV) - S = img_hsv[:, :, 1].astype(np.float32) - V = img_hsv[:, :, 2].astype(np.float32) - a = (random.random() * 2 - 1) * self.fraction + 1 - S *= a - if a > 1: - np.clip(S, a_min=0, a_max=255, out=S) + if self.use_hsvgain: + hsv_augs = np.random.uniform( + -1, 1, 3) * [self.hgain, self.sgain, self.vgain] + # random selection of h, s, v + hsv_augs *= np.random.randint(0, 2, 3) + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) + + else: + S = img_hsv[:, :, 1].astype(np.float32) + V = img_hsv[:, :, 2].astype(np.float32) + + a = (random.random() * 2 - 1) * self.fraction + 1 + S *= a + if a > 1: + np.clip(S, a_min=0, a_max=255, out=S) + + a = (random.random() * 2 - 1) * self.fraction + 1 + V *= a + if a > 1: + np.clip(V, a_min=0, a_max=255, out=V) - a = (random.random() * 2 - 1) * self.fraction + 1 - V *= a - if a > 1: - np.clip(V, a_min=0, a_max=255, out=V) + img_hsv[:, :, 1] = S.astype(np.uint8) + img_hsv[:, :, 2] = V.astype(np.uint8) - img_hsv[:, :, 1] = S.astype(np.uint8) - img_hsv[:, :, 2] = V.astype(np.uint8) if self.is_bgr: cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) else: cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img) - sample['image'] = img + sample['image'] = img.astype(np.float32) return sample @@ -2223,7 +2214,7 @@ class RandomResizeCrop(BaseOperator): 'long', resize the image's long side to the maximum of target_size, if keep_ratio is True and mode is 'short', resize the image's short side to the minimum of target_size. cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...] - mode (str): resize mode, `long` or `short`. Details see resizes. + mode (str): resize mode, `long` or `short`. Details see resizes. prob (float): probability of this op. 
keep_ratio (bool): whether keep_ratio or not, default true
             interp (int): the interpolation method
@@ -2425,16 +2416,6 @@ class RandomResizeCrop(BaseOperator):
                                                 [im_scale_x, im_scale_y],
                                                 [resize_w, resize_h])
 
-        # apply rbox
-        if 'gt_rbox2poly' in sample:
-            if np.array(sample['gt_rbox2poly']).shape[1] != 8:
-                logger.warn(
-                    "gt_rbox2poly's length shoule be 8, but actually is {}".
-                    format(len(sample['gt_rbox2poly'])))
-            sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'],
-                                                     [im_scale_x, im_scale_y],
-                                                     [resize_w, resize_h])
-
         # apply polygon
         if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
             sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
@@ -2892,7 +2873,7 @@ class FlipWarpAffine(BaseOperator):
         """FlipWarpAffine
         1. Random Crop
         2. Flip the image horizontal
-        3. Warp affine the image
+        3. Warp affine the image
         """
         super(FlipWarpAffine, self).__init__()
         self.keep_res = keep_res
@@ -3013,3 +2994,409 @@ class CenterRandColor(BaseOperator):
             img = func(img, img_gray)
         sample['image'] = img
         return sample
+
+
+@register_op
+class Mosaic(BaseOperator):
+    """ Mosaic operator for image and gt_bboxes
+    The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py
+
+    1. get mosaic coords
+    2. clip bbox and get mosaic_labels
+    3. random_affine augment
+    4. Mixup augment as copypaste (optional), not used in tiny/nano
+
+    Args:
+        prob (float): probability of using Mosaic, 1.0 as default
+        input_dim (list[int]): input shape
+        degrees (list[2]): the rotate range to apply, transform range is [min, max]
+        translate (list[2]): the translate range to apply, transform range is [min, max]
+        scale (list[2]): the scale range to apply, transform range is [min, max]
+        shear (list[2]): the shear range to apply, transform range is [min, max]
+        enable_mixup (bool): whether to enable Mixup or not
+        mixup_prob (float): probability of using Mixup, 1.0 as default
+        mixup_scale (list[int]): scale range of Mixup
+        remove_outside_box (bool): whether remove outside boxes, False as
+            default in COCO dataset, True in MOT dataset
+    """
+
+    def __init__(self,
+                 prob=1.0,
+                 input_dim=[640, 640],
+                 degrees=[-10, 10],
+                 translate=[-0.1, 0.1],
+                 scale=[0.1, 2],
+                 shear=[-2, 2],
+                 enable_mixup=True,
+                 mixup_prob=1.0,
+                 mixup_scale=[0.5, 1.5],
+                 remove_outside_box=False):
+        super(Mosaic, self).__init__()
+        self.prob = prob
+        if isinstance(input_dim, Integral):
+            input_dim = [input_dim, input_dim]
+        self.input_dim = input_dim
+        self.degrees = degrees
+        self.translate = translate
+        self.scale = scale
+        self.shear = shear
+        self.enable_mixup = enable_mixup
+        self.mixup_prob = mixup_prob
+        self.mixup_scale = mixup_scale
+        self.remove_outside_box = remove_outside_box
+
+    def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w):
+        # (x1, y1, x2, y2) means coords in large image,
+        # small_coords means coords in small image in mosaic aug.
+ if mosaic_idx == 0: + # top left + x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc + small_coords = w - (x2 - x1), h - (y2 - y1), w, h + elif mosaic_idx == 1: + # top right + x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc + small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h + elif mosaic_idx == 2: + # bottom left + x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) + small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h) + elif mosaic_idx == 3: + # bottom right + x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, + yc + h) + small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h) + + return (x1, y1, x2, y2), small_coords + + def random_affine_augment(self, + img, + labels=[], + input_dim=[640, 640], + degrees=[-10, 10], + scales=[0.1, 2], + shears=[-2, 2], + translates=[-0.1, 0.1]): + # random rotation and scale + degree = random.uniform(degrees[0], degrees[1]) + scale = random.uniform(scales[0], scales[1]) + assert scale > 0, "Argument scale should be positive." + R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale) + M = np.ones([2, 3]) + + # random shear + shear = random.uniform(shears[0], shears[1]) + shear_x = math.tan(shear * math.pi / 180) + shear_y = math.tan(shear * math.pi / 180) + M[0] = R[0] + shear_y * R[1] + M[1] = R[1] + shear_x * R[0] + + # random translation + translate = random.uniform(translates[0], translates[1]) + translation_x = translate * input_dim[0] + translation_y = translate * input_dim[1] + M[0, 2] = translation_x + M[1, 2] = translation_y + + # warpAffine + img = cv2.warpAffine( + img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114)) + + num_gts = len(labels) + if num_gts > 0: + # warp corner points + corner_points = np.ones((4 * num_gts, 3)) + corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + 4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1 + # apply affine transform + corner_points = corner_points @M.T + corner_points = corner_points.reshape(num_gts, 8) + + # create new boxes + corner_xs = corner_points[:, 0::2] + corner_ys = corner_points[:, 1::2] + new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1), + corner_xs.max(1), corner_ys.max(1))) + new_bboxes = new_bboxes.reshape(4, num_gts).T + + # clip boxes + new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0]) + new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1]) + labels[:, :4] = new_bboxes + + return img, labels + + def __call__(self, sample, context=None): + if not isinstance(sample, Sequence): + return sample + + assert len( + sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup." + if np.random.uniform(0., 1.) > self.prob: + return sample[0] + + mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], [] + input_h, input_w = self.input_dim + yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) + xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) + mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8) + + # 1. get mosaic coords + for mosaic_idx, sp in enumerate(sample[:4]): + img = sp['image'] + gt_bbox = sp['gt_bbox'] + h0, w0 = img.shape[:2] + scale = min(1. * input_h / h0, 1. * input_w / w0) + img = cv2.resize( + img, (int(w0 * scale), int(h0 * scale)), + interpolation=cv2.INTER_LINEAR) + (h, w, c) = img.shape[:3] + + # suffix l means large image, while s means small image in mosaic aug. 
+            (l_x1, l_y1, l_x2, l_y2), (
+                s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords(
+                    mosaic_idx, xc, yc, w, h, input_h, input_w)
+
+            mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
+            padw, padh = l_x1 - s_x1, l_y1 - s_y1
+
+            # shift gt_bbox (pixel xyxy) into the mosaic canvas
+            _gt_bbox = gt_bbox.copy()
+            if len(gt_bbox) > 0:
+                _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw
+                _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh
+                _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw
+                _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh
+
+            mosaic_gt_bbox.append(_gt_bbox)
+            mosaic_gt_class.append(sp['gt_class'])
+            if 'is_crowd' in sp:
+                mosaic_is_crowd.append(sp['is_crowd'])
+            if 'difficult' in sp:
+                mosaic_difficult.append(sp['difficult'])
+
+        # 2. clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd])
+        if len(mosaic_gt_bbox):
+            mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0)
+            mosaic_gt_class = np.concatenate(mosaic_gt_class, 0)
+            if mosaic_is_crowd:
+                mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0)
+                mosaic_labels = np.concatenate([
+                    mosaic_gt_bbox,
+                    mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
+                    mosaic_is_crowd.astype(mosaic_gt_bbox.dtype)
+                ], 1)
+            elif mosaic_difficult:
+                mosaic_difficult = np.concatenate(mosaic_difficult, 0)
+                mosaic_labels = np.concatenate([
+                    mosaic_gt_bbox,
+                    mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
+                    mosaic_difficult.astype(mosaic_gt_bbox.dtype)
+                ], 1)
+            else:
+                mosaic_labels = np.concatenate([
+                    mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype)
+                ], 1)
+            if self.remove_outside_box:
+                # for MOT dataset
+                flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w
+                flag2 = mosaic_gt_bbox[:, 2] > 0
+                flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h
+                flag4 = mosaic_gt_bbox[:, 3] > 0
+                flag_all = flag1 * flag2 * flag3 * flag4
+                mosaic_labels = mosaic_labels[flag_all]
+            else:
+                mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0,
+                                              2 * input_w)
+                mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0,
+                                              2 * input_h)
+                mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0,
+                                              2 * input_w)
+                mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0,
+                                              2 * input_h)
+        else:
+            mosaic_labels = np.zeros((1, 6))
+
+        # 3. random_affine augment
+        mosaic_img, mosaic_labels = self.random_affine_augment(
+            mosaic_img,
+            mosaic_labels,
+            input_dim=self.input_dim,
+            degrees=self.degrees,
+            translates=self.translate,
+            scales=self.scale,
+            shears=self.shear)
+
+        # 4. 
Mixup augment as copypaste, https://arxiv.org/abs/2012.07177
+        # optional, not used (enable_mixup=False) in tiny/nano
+        if (self.enable_mixup and not len(mosaic_labels) == 0 and
+                random.random() < self.mixup_prob):
+            sample_mixup = sample[4]
+            mixup_img = sample_mixup['image']
+            if 'is_crowd' in sample_mixup:
+                cp_labels = np.concatenate([
+                    sample_mixup['gt_bbox'],
+                    sample_mixup['gt_class'].astype(mosaic_labels.dtype),
+                    sample_mixup['is_crowd'].astype(mosaic_labels.dtype)
+                ], 1)
+            elif 'difficult' in sample_mixup:
+                cp_labels = np.concatenate([
+                    sample_mixup['gt_bbox'],
+                    sample_mixup['gt_class'].astype(mosaic_labels.dtype),
+                    sample_mixup['difficult'].astype(mosaic_labels.dtype)
+                ], 1)
+            else:
+                cp_labels = np.concatenate([
+                    sample_mixup['gt_bbox'],
+                    sample_mixup['gt_class'].astype(mosaic_labels.dtype)
+                ], 1)
+            mosaic_img, mosaic_labels = self.mixup_augment(
+                mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img)
+
+        sample0 = sample[0]
+        sample0['image'] = mosaic_img.astype(np.uint8)  # can not be float32
+        sample0['h'] = float(mosaic_img.shape[0])
+        sample0['w'] = float(mosaic_img.shape[1])
+        sample0['im_shape'][0] = sample0['h']
+        sample0['im_shape'][1] = sample0['w']
+        sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32)
+        sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32)
+        if 'is_crowd' in sample[0]:
+            sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32)
+        if 'difficult' in sample[0]:
+            sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32)
+        return sample0
+
+    def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels,
+                      img):
+        jit_factor = random.uniform(*self.mixup_scale)
+        FLIP = random.uniform(0, 1) > 0.5
+        if len(img.shape) == 3:
+            cp_img = np.ones(
+                (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114
+        else:
+            cp_img = np.ones(input_dim, dtype=np.uint8) * 114
+
+        cp_scale_ratio = min(input_dim[0] / img.shape[0],
+                             input_dim[1] / img.shape[1])
+        resized_img = cv2.resize(
+            img, (int(img.shape[1] * cp_scale_ratio),
+                  int(img.shape[0] * cp_scale_ratio)),
+            interpolation=cv2.INTER_LINEAR)
+
+        cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[
+            1] * cp_scale_ratio)] = resized_img
+
+        cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor),
+                                     int(cp_img.shape[0] * jit_factor)))
+        cp_scale_ratio *= jit_factor
+
+        if FLIP:
+            cp_img = cp_img[:, ::-1, :]
+
+        origin_h, origin_w = cp_img.shape[:2]
+        target_h, target_w = origin_img.shape[:2]
+        padded_img = np.zeros(
+            (max(origin_h, target_h), max(origin_w, target_w), 3),
+            dtype=np.uint8)
+        padded_img[:origin_h, :origin_w] = cp_img
+
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
+        padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset:
+                                        x_offset + target_w]
+
+        # adjust boxes
+        cp_bboxes_origin_np = cp_labels[:, :4].copy()
+        cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] *
+                                               cp_scale_ratio, 0, origin_w)
+        cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] *
+                                               cp_scale_ratio, 0, origin_h)
+
+        if FLIP:
+            cp_bboxes_origin_np[:, 0::2] = (
+                origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1])
+        cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
+        if self.remove_outside_box:
+            # for MOT dataset
+            cp_bboxes_transformed_np[:, 0::2] -= x_offset
+            cp_bboxes_transformed_np[:, 1::2] -= y_offset
+        else:
+            
cp_bboxes_transformed_np[:, 0::2] = np.clip(
+                cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w)
+            cp_bboxes_transformed_np[:, 1::2] = np.clip(
+                cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h)
+
+        cls_labels = cp_labels[:, 4:5].copy()
+        box_labels = cp_bboxes_transformed_np
+        if cp_labels.shape[-1] == 6:
+            crd_labels = cp_labels[:, 5:6].copy()
+            labels = np.hstack((box_labels, cls_labels, crd_labels))
+        else:
+            labels = np.hstack((box_labels, cls_labels))
+        if self.remove_outside_box:
+            labels = labels[labels[:, 0] < target_w]
+            labels = labels[labels[:, 2] > 0]
+            labels = labels[labels[:, 1] < target_h]
+            labels = labels[labels[:, 3] > 0]
+
+        origin_labels = np.vstack((origin_labels, labels))
+        origin_img = origin_img.astype(np.float32)
+        origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(
+            np.float32)
+
+        return origin_img.astype(np.uint8), origin_labels
+
+
+@register_op
+class PadResize(BaseOperator):
+    """ PadResize for image and gt_bbox
+
+    Args:
+        target_size (list[int]): input shape
+        fill_value (float): pixel value of padded image
+    """
+
+    def __init__(self, target_size, fill_value=114):
+        super(PadResize, self).__init__()
+        if isinstance(target_size, Integral):
+            target_size = [target_size, target_size]
+        self.target_size = target_size
+        self.fill_value = fill_value
+
+    def _resize(self, img, bboxes, labels):
+        ratio = min(self.target_size[0] / img.shape[0],
+                    self.target_size[1] / img.shape[1])
+        w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio)
+        resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR)
+
+        if len(bboxes) > 0:
+            bboxes *= ratio
+            mask = np.minimum(bboxes[:, 2] - bboxes[:, 0],
+                              bboxes[:, 3] - bboxes[:, 1]) > 1
+            bboxes = bboxes[mask]
+            labels = labels[mask]
+        return resized_img, bboxes, labels
+
+    def _pad(self, img):
+        h, w, _ = img.shape
+        if h == self.target_size[0] and w == self.target_size[1]:
+            return img
+        padded_img = np.full(
+            (self.target_size[0], self.target_size[1], 3),
+            self.fill_value,
+            dtype=np.uint8)
+        padded_img[:h, :w] = img
+        return padded_img
+
+    def apply(self, sample, context=None):
+        image = sample['image']
+        bboxes = sample['gt_bbox']
+        labels = sample['gt_class']
+        image, bboxes, labels = self._resize(image, bboxes, labels)
+        sample['image'] = self._pad(image).astype(np.float32)
+        sample['gt_bbox'] = bboxes
+        sample['gt_class'] = labels
+        return sample
diff --git a/paddlers/models/ppdet/data/transform/rotated_operators.py b/paddlers/models/ppdet/data/transform/rotated_operators.py
new file mode 100644
index 0000000..e643d37
--- /dev/null
+++ b/paddlers/models/ppdet/data/transform/rotated_operators.py
@@ -0,0 +1,482 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+from numbers import Number, Integral
+
+import os
+import cv2
+import numpy as np
+import math
+import copy
+
+from PIL import Image, ImageDraw  # used by VisibleRBox below
+from .operators import register_op, BaseOperator, ImageError
+from paddlers.models.ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@register_op
+class RRotate(BaseOperator):
+    """ Rotate Image, Polygon, Box
+
+    Args:
+        scale (float): rotate scale
+        angle (float): rotate angle
+        fill_value (int, tuple): fill color
+        auto_bound (bool): whether auto bound or not
+    """
+
+    def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True):
+        super(RRotate, self).__init__()
+        self.scale = scale
+        self.angle = angle
+        self.fill_value = fill_value
+        self.auto_bound = auto_bound
+
+    def get_rotated_matrix(self, angle, scale, h, w):
+        center = ((w - 1) * 0.5, (h - 1) * 0.5)
+        matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+        # calculate the new size
+        cos = np.abs(matrix[0, 0])
+        sin = np.abs(matrix[0, 1])
+        new_w = h * sin + w * cos
+        new_h = h * cos + w * sin
+        # calculate offset
+        n_w = int(np.round(new_w))
+        n_h = int(np.round(new_h))
+        if self.auto_bound:
+            ratio = min(w / n_w, h / n_h)
+            matrix = cv2.getRotationMatrix2D(center, -angle, ratio)
+        else:
+            matrix[0, 2] += (new_w - w) * 0.5
+            matrix[1, 2] += (new_h - h) * 0.5
+            w = n_w
+            h = n_h
+        return matrix, h, w
+
+    def get_rect_from_pts(self, pts, h, w):
+        """ get minimum rectangle of points
+        """
+        assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
+        min_x, min_y = np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2],
+                                                            axis=1)
+        max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2],
+                                                            axis=1)
+        min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h)
+        max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h)
+        boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1)
+        return boxes
+
+    def apply_image(self, image, matrix, h, w):
+        return cv2.warpAffine(
+            image, matrix, (w, h), borderValue=self.fill_value)
+
+    def apply_pts(self, pts, matrix, h, w):
+        assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
+        # n is number of samples and m is two times the number of points due to (x, y)
+        _, m = pts.shape
+        # transpose points
+        pts_ = pts.reshape(-1, 2).T
+        # pad 1 to convert the points to homogeneous coordinates
+        padding = np.ones((1, pts_.shape[1]), pts.dtype)
+        rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0))
+        return rotated_pts[:2, :].T.reshape(-1, m)
+
+    def apply(self, sample, context=None):
+        image = sample['image']
+        h, w = image.shape[:2]
+        matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w)
+        sample['image'] = self.apply_image(image, matrix, h, w)
+        polys = sample['gt_poly']
+        # TODO: segment or keypoint to be processed
+        if len(polys) > 0:
+            pts = self.apply_pts(polys, matrix, h, w)
+            sample['gt_poly'] = pts
+            sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w)
+
+        return sample
+
+
+@register_op
+class RandomRRotate(BaseOperator):
+    """ Random Rotate Image
+    Args:
+        scale (float, tuple, list): rotate scale
+        scale_mode (str): mode of scale, [range, value, None]
+        angle (float, tuple, list): rotate angle
+        angle_mode (str): mode of angle, [range, value, None]
+        fill_value (float, tuple, list): fill 
value + rotate_prob (float): probability of rotation + auto_bound (bool): whether auto bound or not + """ + + def __init__(self, + scale=1.0, + scale_mode=None, + angle=0., + angle_mode=None, + fill_value=0., + rotate_prob=1.0, + auto_bound=True): + super(RandomRRotate, self).__init__() + self.scale = scale + self.scale_mode = scale_mode + self.angle = angle + self.angle_mode = angle_mode + self.fill_value = fill_value + self.rotate_prob = rotate_prob + self.auto_bound = auto_bound + + def get_angle(self, angle, angle_mode): + assert not angle_mode or angle_mode in [ + 'range', 'value' + ], 'angle mode should be in [range, value, None]' + if not angle_mode: + return angle + elif angle_mode == 'range': + low, high = angle + return np.random.rand() * (high - low) + low + elif angle_mode == 'value': + return np.random.choice(angle) + + def get_scale(self, scale, scale_mode): + assert not scale_mode or scale_mode in [ + 'range', 'value' + ], 'scale mode should be in [range, value, None]' + if not scale_mode: + return scale + elif scale_mode == 'range': + low, high = scale + return np.random.rand() * (high - low) + low + elif scale_mode == 'value': + return np.random.choice(scale) + + def apply(self, sample, context=None): + if np.random.rand() > self.rotate_prob: + return sample + + angle = self.get_angle(self.angle, self.angle_mode) + scale = self.get_scale(self.scale, self.scale_mode) + rotator = RRotate(scale, angle, self.fill_value, self.auto_bound) + return rotator(sample) + + +@register_op +class Poly2RBox(BaseOperator): + """ Polygon to Rotated Box, using new OpenCV definition since 4.5.1 + + Args: + filter_threshold (int, float): threshold to filter annotations + filter_mode (str): filter mode, ['area', 'edge'] + rbox_type (str): rbox type, ['le135', 'oc'] + + """ + + def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'): + super(Poly2RBox, self).__init__() + self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode) + self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np + + def filter(self, size, threshold, mode): + if mode == 'area': + if size[0] * size[1] < threshold: + return True + elif mode == 'edge': + if min(size) < threshold: + return True + return False + + def get_rbox(self, polys): + valid_ids, rboxes, bboxes = [], [], [] + for i, poly in enumerate(polys): + cx, cy, w, h, angle = self.rbox_fn(poly) + if self.filter_fn((w, h)): + continue + rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32)) + valid_ids.append(i) + xmin, ymin = min(poly[0::2]), min(poly[1::2]) + xmax, ymax = max(poly[0::2]), max(poly[1::2]) + bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32)) + + if len(valid_ids) == 0: + rboxes = np.zeros((0, 5), dtype=np.float32) + bboxes = np.zeros((0, 4), dtype=np.float32) + else: + rboxes = np.stack(rboxes) + bboxes = np.stack(bboxes) + + return rboxes, bboxes, valid_ids + + def apply(self, sample, context=None): + rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly']) + sample['gt_rbox'] = rboxes + sample['gt_bbox'] = bboxes + for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']: + if k in sample: + sample[k] = sample[k][valid_ids] + + return sample + + +@register_op +class Poly2Array(BaseOperator): + """ convert gt_poly to np.array for rotated bboxes + """ + + def __init__(self): + super(Poly2Array, self).__init__() + + def apply(self, sample, context=None): + if 'gt_poly' in sample: + sample['gt_poly'] = np.array( + sample['gt_poly'], 
dtype=np.float32).reshape((-1, 8)) + + return sample + + +@register_op +class RResize(BaseOperator): + def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): + """ + Resize image to target size. if keep_ratio is True, + resize the image's long side to the maximum of target_size + if keep_ratio is False, resize the image to target size(h, w) + Args: + target_size (int|list): image target size + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): the interpolation method + """ + super(RResize, self).__init__() + self.keep_ratio = keep_ratio + self.interp = interp + if not isinstance(target_size, (Integral, Sequence)): + raise TypeError( + "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". + format(type(target_size))) + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + + def apply_image(self, image, scale): + im_scale_x, im_scale_y = scale + + return cv2.resize( + image, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + + def apply_pts(self, pts, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + pts[:, 0::2] *= im_scale_x + pts[:, 1::2] *= im_scale_y + pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w) + pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h) + return pts + + def apply(self, sample, context=None): + """ Resize the image numpy. + """ + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + + # apply image + im_shape = im.shape + if self.keep_ratio: + + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + + im_scale = min(target_size_min / im_size_min, + target_size_max / im_size_max) + + resize_h = im_scale * float(im_shape[0]) + resize_w = im_scale * float(im_shape[1]) + + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) + sample['image'] = im.astype(np.float32) + sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_pts(sample['gt_poly'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + return sample + + +@register_op +class RandomRFlip(BaseOperator): + def __init__(self, prob=0.5): + """ + Args: + prob (float): the probability of flipping image + """ + super(RandomRFlip, self).__init__() + self.prob = prob + if not (isinstance(self.prob, float)): + raise TypeError("{}: input type is invalid.".format(self)) + + def apply_image(self, image): + return image[:, ::-1, :] + + def apply_pts(self, pts, width): + oldx = pts[:, 0::2].copy() + pts[:, 0::2] 
= width - oldx - 1
+        return pts
+
+    def apply(self, sample, context=None):
+        """Flip the image and bounding box.
+        Operators:
+            1. Flip the image numpy.
+            2. Transform the bboxes' x coordinates.
+              (Must judge whether the coordinates are normalized!)
+            3. Transform the segmentations' x coordinates.
+              (Must judge whether the coordinates are normalized!)
+        Output:
+            sample: the image, bounding box and segmentation part
+                    in sample are flipped.
+        """
+        if np.random.uniform(0, 1) < self.prob:
+            im = sample['image']
+            height, width = im.shape[:2]
+            im = self.apply_image(im)
+            if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+                sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width)
+            if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+                sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width)
+
+            sample['flipped'] = True
+            sample['image'] = im
+        return sample
+
+
+@register_op
+class VisibleRBox(BaseOperator):
+    """
+    In debug mode, visualize images according to `gt_box`.
+    (Currently only supported when not cropping and flipping image.)
+    """
+
+    def __init__(self, output_dir='debug', is_normalized=False):
+        super(VisibleRBox, self).__init__()
+        self.output_dir = output_dir
+        self.is_normalized = is_normalized  # whether gt_keypoint coords are normalized
+        if not os.path.isdir(output_dir):
+            os.makedirs(output_dir)
+
+    def apply(self, sample, context=None):
+        image = Image.fromarray(sample['image'].astype(np.uint8))
+        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
+        width = sample['w']
+        height = sample['h']
+        # gt_poly = sample['gt_rbox']
+        gt_poly = sample['gt_poly']
+        gt_class = sample['gt_class']
+        draw = ImageDraw.Draw(image)
+        for i in range(gt_poly.shape[0]):
+            x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i]
+            draw.line(
+                [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
+                width=2,
+                fill='green')
+            # draw label
+            xmin = min(x1, x2, x3, x4)
+            ymin = min(y1, y2, y3, y4)
+            text = str(gt_class[i][0])
+            tw, th = draw.textsize(text)
+            draw.rectangle(
+                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
+            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
+
+        if 'gt_keypoint' in sample.keys():
+            gt_keypoint = sample['gt_keypoint']
+            if self.is_normalized:
+                for i in range(gt_keypoint.shape[1]):
+                    if i % 2:
+                        gt_keypoint[:, i] = gt_keypoint[:, i] * height
+                    else:
+                        gt_keypoint[:, i] = gt_keypoint[:, i] * width
+            for i in range(gt_keypoint.shape[0]):
+                keypoint = gt_keypoint[i]
+                for j in range(int(keypoint.shape[0] / 2)):
+                    x1 = round(keypoint[2 * j]).astype(np.int32)
+                    y1 = round(keypoint[2 * j + 1]).astype(np.int32)
+                    draw.ellipse(
+                        (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
+        save_path = os.path.join(self.output_dir, out_file_name)
+        image.save(save_path, quality=95)
+        return sample
+
+
+@register_op
+class Rbox2Poly(BaseOperator):
+    """
+    Convert rbox format to poly format.
+    """
+
+    def __init__(self):
+        super(Rbox2Poly, self).__init__()
+
+    def apply(self, sample, context=None):
+        assert 'gt_rbox' in sample
+        assert sample['gt_rbox'].shape[1] == 5
+        rboxes = sample['gt_rbox']
+        polys = rbox2poly_np(rboxes)
+        sample['gt_poly'] = polys
+        xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1)
+        xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1)
+        sample['gt_bbox'] = np.stack([xmin, ymin, xmax, ymax], axis=1)
+        return sample
diff --git a/paddlers/models/ppdet/data/utils.py b/paddlers/models/ppdet/data/utils.py
new file mode 100644
index 0000000..02573e6
--- /dev/null
+++ b/paddlers/models/ppdet/data/utils.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numbers
+import numpy as np
+
+try:
+    from collections.abc import Sequence, Mapping
+except:
+    from collections import Sequence, Mapping
+
+
+def default_collate_fn(batch):
+    """
+    Default batch collating function for :code:`paddle.io.DataLoader`.
+    The input is a list of samples, where each element in the list
+    is the data of one sample. Sample data may be composed of lists,
+    dictionaries, strings, numbers and numpy arrays. This function
+    parses the input data recursively and stacks numbers, numpy
+    arrays and paddle.Tensor values into batched data. e.g. for
+    the following input data:
+    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
+
+
+    This default collate function zips each number and numpy array
+    field together and stacks each field into a batch field as follows:
+    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
+    Args:
+        batch (list of sample data): batch should be a list of sample data.
+
+    Returns:
+        Batched data: each number, numpy array and paddle.Tensor
+                      in the input data, batched.
+    """
+    sample = batch[0]
+    if isinstance(sample, np.ndarray):
+        batch = np.stack(batch, axis=0)
+        return batch
+    elif isinstance(sample, numbers.Number):
+        batch = np.array(batch)
+        return batch
+    elif isinstance(sample, (str, bytes)):
+        return batch
+    elif isinstance(sample, Mapping):
+        return {
+            key: default_collate_fn([d[key] for d in batch])
+            for key in sample
+        }
+    elif isinstance(sample, Sequence):
+        sample_fields_num = len(sample)
+        if not all(len(sample) == sample_fields_num for sample in iter(batch)):
+            raise RuntimeError(
+                "fields number not same among samples in a batch")
+        return [default_collate_fn(fields) for fields in zip(*batch)]
+
+    raise TypeError("batch data can only contain: tensor, numpy.ndarray, "
+                    "dict, list, number, but got {}".format(type(sample)))
diff --git a/paddlers/models/ppdet/engine/__init__.py b/paddlers/models/ppdet/engine/__init__.py
index 038bb0f..0074a7e 100644
--- a/paddlers/models/ppdet/engine/__init__.py
+++ b/paddlers/models/ppdet/engine/__init__.py
@@ -1,15 +1,15 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from . import trainer diff --git a/paddlers/models/ppdet/engine/callbacks.py b/paddlers/models/ppdet/engine/callbacks.py index 6af2e82..e4a25f7 100644 --- a/paddlers/models/ppdet/engine/callbacks.py +++ b/paddlers/models/ppdet/engine/callbacks.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
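To make the behavior of the new `default_collate_fn` above concrete, here is a minimal usage sketch (the array shapes are invented for illustration; this snippet is not part of the patch itself):

import numpy as np
from paddlers.models.ppdet.data.utils import default_collate_fn

# A batch of two dict samples: arrays are stacked along a new first
# axis, plain numbers are gathered into a numpy array.
batch = [
    {'image': np.zeros((3, 224, 224), dtype=np.float32), 'label': 1},
    {'image': np.ones((3, 224, 224), dtype=np.float32), 'label': 3},
]
out = default_collate_fn(batch)
assert out['image'].shape == (2, 3, 224, 224)
assert (out['label'] == np.array([1, 3])).all()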
from __future__ import absolute_import
@@ -182,7 +182,7 @@ class Checkpointer(Callback):
                 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
                     save_name = str(
                         epoch_id) if epoch_id != end_epoch - 1 else "model_final"
-                    weight = self.weight
+                    weight = self.weight.state_dict()
         elif mode == 'eval':
             if 'save_best_model' in status and status['save_best_model']:
                 for metric in self.model._metrics:
@@ -198,15 +198,25 @@ class Checkpointer(Callback):
                                        "training iterations being too few or not " \
                                        "loading the correct weights.")
                         return
-                    if map_res[key][0] > self.best_ap:
+                    if map_res[key][0] >= self.best_ap:
                         self.best_ap = map_res[key][0]
                         save_name = 'best_model'
-                        weight = self.weight
+                        weight = self.weight.state_dict()
                     logger.info("Best test {} ap is {:0.3f}.".format(
                         key, self.best_ap))
         if weight:
-            save_model(weight, self.model.optimizer, self.save_dir,
-                       save_name, epoch_id + 1)
+            if self.model.use_ema:
+                # save model and ema_model
+                save_model(
+                    status['weight'],
+                    self.model.optimizer,
+                    self.save_dir,
+                    save_name,
+                    epoch_id + 1,
+                    ema_model=weight)
+            else:
+                save_model(weight, self.model.optimizer, self.save_dir,
+                           save_name, epoch_id + 1)
 
 
 class WiferFaceEval(Callback):
@@ -251,7 +261,7 @@ class VisualDLWriter(Callback):
                 for loss_name, loss_value in training_staus.get().items():
                     self.vdl_writer.add_scalar(loss_name, loss_value,
                                                self.vdl_loss_step)
-                self.vdl_loss_step += 1
+                    self.vdl_loss_step += 1
             elif mode == 'test':
                 ori_image = status['original_image']
                 result_image = status['result_image']
@@ -279,6 +289,157 @@ class VisualDLWriter(Callback):
                 self.vdl_mAP_step += 1
 
 
+class WandbCallback(Callback):
+    def __init__(self, model):
+        super(WandbCallback, self).__init__(model)
+
+        try:
+            import wandb
+            self.wandb = wandb
+        except Exception as e:
+            logger.error('wandb not found, please install wandb. '
+                         'Use: `pip install wandb`.')
+            raise e
+
+        self.wandb_params = model.cfg.get('wandb', None)
+        self.save_dir = os.path.join(self.model.cfg.save_dir,
+                                     self.model.cfg.filename)
+        if self.wandb_params is None:
+            self.wandb_params = {}
+        for k, v in model.cfg.items():
+            if k.startswith("wandb_"):
+                self.wandb_params.update({k[len("wandb_"):]: v})
+
+        self._run = None
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            _ = self.run
+            self.run.config.update(self.model.cfg)
+            self.run.define_metric("epoch")
+            self.run.define_metric("eval/*", step_metric="epoch")
+
+        self.best_ap = 0
+
+    @property
+    def run(self):
+        if self._run is None:
+            if self.wandb.run is not None:
+                logger.info(
+                    "There is an ongoing wandb run which will be used "
+                    "for logging. Please use `wandb.finish()` to end that "
+                    "if the behaviour is not intended")
+                self._run = self.wandb.run
+            else:
+                self._run = self.wandb.init(**self.wandb_params)
+        return self._run
+
+    def save_model(self,
+                   optimizer,
+                   save_dir,
+                   save_name,
+                   last_epoch,
+                   ema_model=None,
+                   ap=None,
+                   tags=None):
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            model_path = os.path.join(save_dir, save_name)
+            metadata = {}
+            metadata["last_epoch"] = last_epoch
+            if ap:
+                metadata["ap"] = ap
+            if ema_model:  # log the EMA weights alongside the raw model
+                ema_artifact = self.wandb.Artifact(
+                    name="ema_model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
+                model_artifact = self.wandb.Artifact(
+                    name="model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
+
+                ema_artifact.add_file(model_path + ".pdema", name="model_ema")
+                model_artifact.add_file(model_path + ".pdparams", name="model")
+
+                self.run.log_artifact(ema_artifact, aliases=tags)
+                self.run.log_artifact(model_artifact, aliases=tags)
+            else:
+                model_artifact = self.wandb.Artifact(
+                    name="model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
+                model_artifact.add_file(model_path + ".pdparams", name="model")
+                self.run.log_artifact(model_artifact, aliases=tags)
+
+    def on_step_end(self, status):
+
+        mode = status['mode']
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            if mode == 'train':
+                training_status = status['training_staus'].get()
+                for k, v in training_status.items():
+                    training_status[k] = float(v)
+                metrics = {"train/" + k: v for k, v in training_status.items()}
+                self.run.log(metrics)
+
+    def on_epoch_end(self, status):
+        mode = status['mode']
+        epoch_id = status['epoch_id']
+        save_name = None
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            if mode == 'train':
+                end_epoch = self.model.cfg.epoch
+                if (
+                        epoch_id + 1
+                ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
+                    save_name = str(
+                        epoch_id) if epoch_id != end_epoch - 1 else "model_final"
+                    tags = ["latest", "epoch_{}".format(epoch_id)]
+                    self.save_model(
+                        self.model.optimizer,
+                        self.save_dir,
+                        save_name,
+                        epoch_id + 1,
+                        self.model.use_ema,
+                        tags=tags)
+            if mode == 'eval':
+                merged_dict = {}
+                for metric in self.model._metrics:
+                    for key, map_value in metric.get_results().items():
+                        merged_dict["eval/{}-mAP".format(key)] = map_value[0]
+                merged_dict["epoch"] = status["epoch_id"]
+                self.run.log(merged_dict)
+
+                if 'save_best_model' in status and status['save_best_model']:
+                    for metric in self.model._metrics:
+                        map_res = metric.get_results()
+                        if 'bbox' in map_res:
+                            key = 'bbox'
+                        elif 'keypoint' in map_res:
+                            key = 'keypoint'
+                        else:
+                            key = 'mask'
+                        if key not in map_res:
+                            logger.warning("Evaluation results empty, this may be due to " \
+                                           "training iterations being too few or not " \
+                                           "loading the correct weights.")
+                            return
+                        if map_res[key][0] >= self.best_ap:
+                            self.best_ap = map_res[key][0]
+                            save_name = 'best_model'
+                            tags = ["best", "epoch_{}".format(epoch_id)]
+
+                            self.save_model(
+                                self.model.optimizer,
+                                self.save_dir,
+                                save_name,
+                                last_epoch=epoch_id + 1,
+                                ema_model=self.model.use_ema,
+                                ap=self.best_ap,
+                                tags=tags)
+
+    def on_train_end(self, status):
+        self.run.finish()
+
+
 class SniperProposalsGenerator(Callback):
     def __init__(self, model):
         super(SniperProposalsGenerator, self).__init__(model)
diff --git a/paddlers/models/ppdet/engine/env.py b/paddlers/models/ppdet/engine/env.py
index 9a378dc..9e1a7e8 100644
--- a/paddlers/models/ppdet/engine/env.py
+++ b/paddlers/models/ppdet/engine/env.py
@@ -1,15 
+1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/engine/export_utils.py b/paddlers/models/ppdet/engine/export_utils.py index 79fe86a..5b0a997 100644 --- a/paddlers/models/ppdet/engine/export_utils.py +++ b/paddlers/models/ppdet/engine/export_utils.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
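Regarding the `WandbCallback` introduced above: it builds its `wandb.init(**kwargs)` arguments either from a `wandb` dict in the config or from top-level keys carrying a `wandb_` prefix. A minimal sketch of that key-collection step, with hypothetical key names (only the `wandb_` prefix convention comes from the patch):

# Hypothetical config fragment; the key names are illustrative.
cfg = {
    'wandb_project': 'ppdet-experiments',
    'wandb_entity': 'my-team',
}
# Strip the "wandb_" prefix and forward the rest to wandb.init().
init_kwargs = {k[len('wandb_'):]: v
               for k, v in cfg.items() if k.startswith('wandb_')}
# init_kwargs == {'project': 'ppdet-experiments', 'entity': 'my-team'}
# wandb.init(**init_kwargs) would then start (or reuse) a run.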
from __future__ import absolute_import @@ -41,22 +41,26 @@ TRT_MIN_SUBGRAPH = { 'HigherHRNet': 3, 'HRNet': 3, 'DeepSORT': 3, + 'ByteTrack': 10, 'JDE': 10, 'FairMOT': 5, 'GFL': 16, 'PicoDet': 3, 'CenterNet': 5, 'TOOD': 5, + 'YOLOX': 8, } KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet'] -MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT'] +MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack'] def _prune_input_spec(input_spec, program, targets): # try to prune static program to figure out pruned input spec # so we perform following operations in static mode + device = paddle.get_device() paddle.enable_static() + paddle.set_device(device) pruned_input_spec = [{}] program = program.clone() program = program._prune(targets=targets) @@ -67,7 +71,7 @@ def _prune_input_spec(input_spec, program, targets): pruned_input_spec[0][name] = spec except Exception: pass - paddle.disable_static() + paddle.disable_static(place=device) return pruned_input_spec @@ -88,6 +92,7 @@ def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): if key == 'Resize': if int(image_shape[1]) != -1: value['target_size'] = image_shape[1:] + value['interp'] = value.get('interp', 1) # cv2.INTER_LINEAR if fuse_normalize and key == 'NormalizeImage': continue p.update(value) @@ -120,12 +125,20 @@ def _dump_infer_config(config, path, image_shape, model): setup_orderdict() use_dynamic_shape = True if image_shape[2] == -1 else False infer_cfg = OrderedDict({ - 'mode': 'fluid', + 'mode': 'paddle', 'draw_threshold': 0.5, 'metric': config['metric'], 'use_dynamic_shape': use_dynamic_shape }) + export_onnx = config.get('export_onnx', False) + export_eb = config.get('export_eb', False) + infer_arch = config['architecture'] + if 'RCNN' in infer_arch and export_onnx: + logger.warning( + "Exporting RCNN model to ONNX only support batch_size = 1") + infer_cfg['export_onnx'] = True + infer_cfg['export_eb'] = export_eb if infer_arch in MOT_ARCH: if infer_arch == 'DeepSORT': @@ -140,6 +153,12 @@ def _dump_infer_config(config, path, image_shape, model): infer_cfg['min_subgraph_size'] = min_subgraph_size arch_state = True break + + if infer_arch == 'YOLOX': + infer_cfg['arch'] = infer_arch + infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] + arch_state = True + if not arch_state: logger.error( 'Architecture: {} is not supported for exporting model now.\n'. 
@@ -165,12 +184,17 @@ def _dump_infer_config(config, path, image_shape, model): reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:]) if infer_arch == 'PicoDet': - infer_cfg['NMS'] = config['PicoHead']['nms'] - # In order to speed up the prediction, the threshold of nms + if hasattr(config, 'export') and config['export'].get( + 'post_process', + False) and not config['export'].get('benchmark', False): + infer_cfg['arch'] = 'GFL' + head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead' + infer_cfg['NMS'] = config[head_name]['nms'] + # In order to speed up the prediction, the threshold of nms # is adjusted here, which can be changed in infer_cfg.yml - config['PicoHead']['nms']["score_threshold"] = 0.3 - config['PicoHead']['nms']["nms_threshold"] = 0.5 - infer_cfg['fpn_stride'] = config['PicoHead']['fpn_stride'] + config[head_name]['nms']["score_threshold"] = 0.3 + config[head_name]['nms']["nms_threshold"] = 0.5 + infer_cfg['fpn_stride'] = config[head_name]['fpn_stride'] yaml.dump(infer_cfg, open(path, 'w')) logger.info("Export inference config file to {}".format(os.path.join(path))) diff --git a/paddlers/models/ppdet/engine/tracker.py b/paddlers/models/ppdet/engine/tracker.py index ab358bc..6a1b7a2 100644 --- a/paddlers/models/ppdet/engine/tracker.py +++ b/paddlers/models/ppdet/engine/tracker.py @@ -17,27 +17,33 @@ from __future__ import division from __future__ import print_function import os -import cv2 import glob import re import paddle +import paddle.nn as nn import numpy as np -import os.path as osp +from tqdm import tqdm from collections import defaultdict from paddlers.models.ppdet.core.workspace import create from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight from paddlers.models.ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box from paddlers.models.ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results - -from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric -from paddlers.models.ppdet.metrics import MCMOTMetric +from paddlers.models.ppdet.modeling.mot.tracker import JDETracker, DeepSORTTracker, OCSORTTracker +from paddlers.models.ppdet.modeling.architectures import YOLOX +from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric +import paddlers.models.ppdet.utils.stats as stats from .callbacks import Callback, ComposeCallback from paddlers.models.ppdet.utils.logger import setup_logger logger = setup_logger(__name__) +MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack'] +MOT_ARCH_JDE = ['JDE', 'FairMOT'] +MOT_ARCH_SDE = ['DeepSORT', 'ByteTrack'] +MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti'] + __all__ = ['Tracker'] @@ -55,6 +61,12 @@ class Tracker(object): # build model self.model = create(cfg.architecture) + if isinstance(self.model.detector, YOLOX): + for k, m in self.model.named_sublayers(): + if isinstance(m, nn.BatchNorm2D): + m._epsilon = 1e-3 # for amp(fp16) + m._momentum = 0.97 # 0.03 in pytorch + self.status = {} self.start_epoch = 0 @@ -108,11 +120,15 @@ class Tracker(object): load_weight(self.model, weights, self.optimizer) def load_weights_sde(self, det_weights, reid_weights): - if self.model.detector: + with_detector = self.model.detector is not None + with_reid = self.model.reid is not None + + if with_detector: load_weight(self.model.detector, det_weights) - load_weight(self.model.reid, reid_weights) + if with_reid: + load_weight(self.model.reid, reid_weights) else: - 
load_weight(self.model.reid, reid_weights, self.optimizer) + load_weight(self.model.reid, reid_weights) def _eval_seq_jde(self, dataloader, @@ -131,11 +147,8 @@ class Tracker(object): self.model.eval() results = defaultdict(list) # support single class and multi classes - for step_id, data in enumerate(dataloader): + for step_id, data in enumerate(tqdm(dataloader)): self.status['step_id'] = step_id - if frame_id % 40 == 0: - logger.info('Processing frame {} ({:.2f} fps)'.format( - frame_id, 1. / max(1e-5, timer.average_time))) # forward timer.tic() pred_dets, pred_embs = self.model(data) @@ -184,24 +197,23 @@ class Tracker(object): if save_dir: if not os.path.exists(save_dir): os.makedirs(save_dir) use_detector = False if not self.model.detector else True + use_reid = False if not self.model.reid else True timer = MOTTimer() results = defaultdict(list) frame_id = 0 self.status['mode'] = 'track' self.model.eval() - self.model.reid.eval() + if use_reid: + self.model.reid.eval() if not use_detector: dets_list = load_det_results(det_file, len(dataloader)) logger.info('Finish loading detection results file {}.'.format( det_file)) - for step_id, data in enumerate(dataloader): + tracker = self.model.tracker + for step_id, data in enumerate(tqdm(dataloader)): self.status['step_id'] = step_id - if frame_id % 40 == 0: - logger.info('Processing frame {} ({:.2f} fps)'.format( - frame_id, 1. / max(1e-5, timer.average_time))) - ori_image = data['ori_image'] # [bs, H, W, 3] ori_image_shape = data['ori_image'].shape[1:3] # ori_image_shape: [H, W] @@ -214,7 +226,7 @@ class Tracker(object): scale_factor = data['scale_factor'][0].numpy() empty_detections = False - # when it has no detected bboxes, will not inference reid model + # when it has no detected bboxes, will not inference reid model # and if visualize, use original image instead # forward @@ -240,7 +252,7 @@ class Tracker(object): outs['bbox'] = outs['bbox'].numpy() outs['bbox_num'] = outs['bbox_num'].numpy() - if outs['bbox_num'] > 0 and empty_detections == False: + if len(outs['bbox']) > 0 and empty_detections == False: # detector outputs: pred_cls_ids, pred_scores, pred_bboxes pred_cls_ids = outs['bbox'][:, 0:1] pred_scores = outs['bbox'][:, 1:2] @@ -249,13 +261,15 @@ class Tracker(object): # with LetterBoxResize and JDEBBoxPostProcess. # # 'scaled' means whether the coords after detector outputs - # have been scaled back to the original image, set True + # have been scaled back to the original image, set True # in general detector, set False in JDE YOLOv3. pred_bboxes = scale_coords(outs['bbox'][:, 2:], input_shape, im_shape, scale_factor) else: pred_bboxes = outs['bbox'][:, 2:] + pred_dets_old = np.concatenate( + (pred_cls_ids, pred_scores, pred_bboxes), axis=1) else: logger.warning( 'Frame {} has not detected object, try to modify score threshold.'. 
@@ -281,52 +295,104 @@ class Tracker(object): # thus will not inference reid model continue - pred_scores = pred_scores[keep_idx[0]] pred_cls_ids = pred_cls_ids[keep_idx[0]] - pred_tlwhs = np.concatenate( - (pred_xyxys[:, 0:2], - pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), - axis=1) + pred_scores = pred_scores[keep_idx[0]] pred_dets = np.concatenate( - (pred_tlwhs, pred_scores, pred_cls_ids), axis=1) - - tracker = self.model.tracker - crops = get_crops( - pred_xyxys, - ori_image, - w=tracker.input_size[0], - h=tracker.input_size[1]) - crops = paddle.to_tensor(crops) - - data.update({'crops': crops}) - pred_embs = self.model(data).numpy() - - tracker.predict() - online_targets = tracker.update(pred_dets, pred_embs) - - online_tlwhs, online_scores, online_ids = [], [], [] - for t in online_targets: - if not t.is_confirmed() or t.time_since_update > 1: - continue - tlwh = t.to_tlwh() - tscore = t.score - tid = t.track_id - if tscore < draw_threshold: continue - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs.append(tlwh) - online_scores.append(tscore) - online_ids.append(tid) - timer.toc() + (pred_cls_ids, pred_scores, pred_xyxys), axis=1) + + if use_reid: + crops = get_crops( + pred_xyxys, + ori_image, + w=tracker.input_size[0], + h=tracker.input_size[1]) + crops = paddle.to_tensor(crops) + + data.update({'crops': crops}) + pred_embs = self.model(data)['embeddings'].numpy() + else: + pred_embs = None - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes) + if isinstance(tracker, DeepSORTTracker): + online_tlwhs, online_scores, online_ids = [], [], [] + tracker.predict() + online_targets = tracker.update(pred_dets, pred_embs) + for t in online_targets: + if not t.is_confirmed() or t.time_since_update > 1: + continue + tlwh = t.to_tlwh() + tscore = t.score + tid = t.track_id + if tscore < draw_threshold: continue + if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue + if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ + 3] > tracker.vertical_ratio: + continue + online_tlwhs.append(tlwh) + online_scores.append(tscore) + online_ids.append(tid) + timer.toc() + + # save results + results[0].append( + (frame_id + 1, online_tlwhs, online_scores, online_ids)) + save_vis_results(data, frame_id, online_ids, online_tlwhs, + online_scores, timer.average_time, show_image, + save_dir, self.cfg.num_classes) + + elif isinstance(tracker, JDETracker): + # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set + tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams( + seq_name, tracker.track_buffer, tracker.conf_thres) + + online_targets_dict = tracker.update(pred_dets_old, pred_embs) + online_tlwhs = defaultdict(list) + online_scores = defaultdict(list) + online_ids = defaultdict(list) + for cls_id in range(self.cfg.num_classes): + online_targets = online_targets_dict[cls_id] + for t in online_targets: + tlwh = t.tlwh + tid = t.track_id + tscore = t.score + if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue + if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ + 3] > tracker.vertical_ratio: + continue + online_tlwhs[cls_id].append(tlwh) + online_ids[cls_id].append(tid) + online_scores[cls_id].append(tscore) + # save results + results[cls_id].append( + (frame_id + 1, 
online_tlwhs[cls_id], + online_scores[cls_id], online_ids[cls_id])) + timer.toc() + save_vis_results(data, frame_id, online_ids, online_tlwhs, + online_scores, timer.average_time, show_image, + save_dir, self.cfg.num_classes) + elif isinstance(tracker, OCSORTTracker): + # OC_SORT Tracker + online_targets = tracker.update(pred_dets_old, pred_embs) + online_tlwhs = [] + online_ids = [] + online_scores = [] + for t in online_targets: + tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]] + tscore = float(t[4]) + tid = int(t[5]) + if tlwh[2] * tlwh[3] > 0: + online_tlwhs.append(tlwh) + online_ids.append(tid) + online_scores.append(tscore) + timer.toc() + # save results + results[0].append( + (frame_id + 1, online_tlwhs, online_scores, online_ids)) + save_vis_results(data, frame_id, online_ids, online_tlwhs, + online_scores, timer.average_time, show_image, + save_dir, self.cfg.num_classes) + else: + raise ValueError(tracker) frame_id += 1 return results, frame_id, timer.average_time, timer.calls @@ -345,10 +411,10 @@ class Tracker(object): if not os.path.exists(output_dir): os.makedirs(output_dir) result_root = os.path.join(output_dir, 'mot_results') if not os.path.exists(result_root): os.makedirs(result_root) - assert data_type in ['mot', 'mcmot', 'kitti'], \ + assert data_type in MOT_DATA_TYPE, \ "data_type should be 'mot', 'mcmot' or 'kitti'" - assert model_type in ['JDE', 'DeepSORT', 'FairMOT'], \ - "model_type should be 'JDE', 'DeepSORT' or 'FairMOT'" + assert model_type in MOT_ARCH, \ + "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" # run tracking n_frame = 0 @@ -371,7 +437,7 @@ class Tracker(object): save_dir = os.path.join(output_dir, 'mot_outputs', seq) if save_images or save_videos else None - logger.info('start seq: {}'.format(seq)) + logger.info('Evaluate seq: {}'.format(seq)) self.dataset.set_images(self.get_infer_images(infer_dir)) dataloader = create('EvalMOTReader')(self.dataset, 0) @@ -379,13 +445,13 @@ class Tracker(object): result_filename = os.path.join(result_root, '{}.txt'.format(seq)) with paddle.no_grad(): - if model_type in ['JDE', 'FairMOT']: + if model_type in MOT_ARCH_JDE: results, nf, ta, tc = self._eval_seq_jde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate) - elif model_type in ['DeepSORT']: + elif model_type in MOT_ARCH_SDE: results, nf, ta, tc = self._eval_seq_sde( dataloader, save_dir=save_dir, @@ -412,7 +478,6 @@ class Tracker(object): os.system(cmd_str) logger.info('Save video in {}.'.format(output_video_path)) - logger.info('Evaluate seq: {}'.format(seq)) # update metrics for metric in self._metrics: metric.update(data_root, seq, data_type, result_root, @@ -471,12 +536,12 @@ class Tracker(object): if not os.path.exists(output_dir): os.makedirs(output_dir) result_root = os.path.join(output_dir, 'mot_results') if not os.path.exists(result_root): os.makedirs(result_root) - assert data_type in ['mot', 'mcmot', 'kitti'], \ + assert data_type in MOT_DATA_TYPE, \ "data_type should be 'mot', 'mcmot' or 'kitti'" - assert model_type in ['JDE', 'DeepSORT', 'FairMOT'], \ - "model_type should be 'JDE', 'DeepSORT' or 'FairMOT'" + assert model_type in MOT_ARCH, \ + "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" - # run tracking + # run tracking if video_file: seq = video_file.split('/')[-1].split('.')[0] self.dataset.set_video(video_file, frame_rate) @@ -504,14 +569,14 @@ class Tracker(object): frame_rate = self.dataset.frame_rate with paddle.no_grad(): - if model_type in ['JDE', 'FairMOT']: + if model_type in 
MOT_ARCH_JDE:
                results, nf, ta, tc = self._eval_seq_jde(
                    dataloader,
                    save_dir=save_dir,
                    show_image=show_image,
                    frame_rate=frame_rate,
                    draw_threshold=draw_threshold)
-            elif model_type in ['DeepSORT']:
+            elif model_type in MOT_ARCH_SDE:
                results, nf, ta, tc = self._eval_seq_sde(
                    dataloader,
                    save_dir=save_dir,
@@ -535,3 +600,33 @@ class Tracker(object):
         write_mot_results(result_filename, results, data_type,
                           self.cfg.num_classes)
+
+
+def get_trick_hyperparams(video_name, ori_buffer, ori_thresh):
+    if video_name[:3] != 'MOT':
+        # only used for MOTChallenge (MOT17, MOT20) Test-set
+        return ori_buffer, ori_thresh
+
+    video_name = video_name[:8]
+    if 'MOT17-05' in video_name:
+        track_buffer = 14
+    elif 'MOT17-13' in video_name:
+        track_buffer = 25
+    else:
+        track_buffer = ori_buffer
+
+    if 'MOT17-01' in video_name:
+        track_thresh = 0.65
+    elif 'MOT17-06' in video_name:
+        track_thresh = 0.65
+    elif 'MOT17-12' in video_name:
+        track_thresh = 0.7
+    elif 'MOT17-14' in video_name:
+        track_thresh = 0.67
+    else:
+        track_thresh = ori_thresh
+
+    if 'MOT20-06' in video_name or 'MOT20-08' in video_name:
+        track_thresh = 0.3
+
+    return track_buffer, track_thresh
diff --git a/paddlers/models/ppdet/engine/trainer.py b/paddlers/models/ppdet/engine/trainer.py
index 2d6640e..93caa64 100644
--- a/paddlers/models/ppdet/engine/trainer.py
+++ b/paddlers/models/ppdet/engine/trainer.py
@@ -20,38 +20,44 @@
 import os
 import sys
 import copy
 import time
+from tqdm import tqdm
 
 import numpy as np
 import typing
-from PIL import Image, ImageOps
+from PIL import Image, ImageOps, ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 import paddle
+import paddle.nn as nn
 import paddle.distributed as dist
 from paddle.distributed import fleet
-from paddle import amp
 from paddle.static import InputSpec
 from paddlers.models.ppdet.optimizer import ModelEMA
 
 from paddlers.models.ppdet.core.workspace import create
-from paddlers.models.ppdet.modeling.architectures.meta_arch import BaseArch
 from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from paddlers.models.ppdet.utils.visualizer import visualize_results, save_result
 from paddlers.models.ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval
 from paddlers.models.ppdet.metrics import RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
 from paddlers.models.ppdet.data.source.sniper_coco import SniperCOCODataSet
 from paddlers.models.ppdet.data.source.category import get_categories
-from paddlers.models.ppdet.utils import stats
+import paddlers.models.ppdet.utils.stats as stats
+from paddlers.models.ppdet.utils.fuse_utils import fuse_conv_bn
 from paddlers.models.ppdet.utils import profiler
+from paddlers.models.ppdet.modeling.post_process import multiclass_nms
 
-from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator
+from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback
 from .export_utils import _dump_infer_config, _prune_input_spec
 
+from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
+
 from paddlers.models.ppdet.utils.logger import setup_logger
 logger = setup_logger('ppdet.engine')
 
 __all__ = ['Trainer']
 
-MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT']
+MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
 
 
 class Trainer(object):
@@ -62,19 +68,30 @@ class 
Trainer(object): self.mode = mode.lower() self.optimizer = None self.is_loaded_weights = False + self.use_amp = self.cfg.get('amp', False) + self.amp_level = self.cfg.get('amp_level', 'O1') + self.custom_white_list = self.cfg.get('custom_white_list', None) + self.custom_black_list = self.cfg.get('custom_black_list', None) # build data loader + capital_mode = self.mode.capitalize() if cfg.architecture in MOT_ARCH and self.mode in ['eval', 'test']: - self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())] + self.dataset = self.cfg['{}MOTDataset'.format( + capital_mode)] = create('{}MOTDataset'.format(capital_mode))() else: - self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())] + self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( + '{}Dataset'.format(capital_mode))() if cfg.architecture == 'DeepSORT' and self.mode == 'train': logger.error('DeepSORT has no need of training on mot dataset.') sys.exit(1) + if cfg.architecture == 'FairMOT' and self.mode == 'eval': + images = self.parse_mot_images(cfg) + self.dataset.set_images(images) + if self.mode == 'train': - self.loader = create('{}Reader'.format(self.mode.capitalize()))( + self.loader = create('{}Reader'.format(capital_mode))( self.dataset, cfg.worker_num) if cfg.architecture == 'JDE' and self.mode == 'train': @@ -94,41 +111,73 @@ class Trainer(object): self.model = self.cfg.model self.is_loaded_weights = True - #normalize params for deploy - self.model.load_meanstd(cfg['TestReader']['sample_transforms']) + if cfg.architecture == 'YOLOX': + for k, m in self.model.named_sublayers(): + if isinstance(m, nn.BatchNorm2D): + m._epsilon = 1e-3 # for amp(fp16) + m._momentum = 0.97 # 0.03 in pytorch - self.use_ema = ('use_ema' in cfg and cfg['use_ema']) - if self.use_ema: - ema_decay = self.cfg.get('ema_decay', 0.9998) - cycle_epoch = self.cfg.get('cycle_epoch', -1) - self.ema = ModelEMA( - self.model, - decay=ema_decay, - use_thres_step=True, - cycle_epoch=cycle_epoch) + #normalize params for deploy + if 'slim' in cfg and cfg['slim_type'] == 'OFA': + self.model.model.load_meanstd(cfg['TestReader'][ + 'sample_transforms']) + elif 'slim' in cfg and cfg['slim_type'] == 'Distill': + self.model.student_model.load_meanstd(cfg['TestReader'][ + 'sample_transforms']) + elif 'slim' in cfg and cfg[ + 'slim_type'] == 'DistillPrune' and self.mode == 'train': + self.model.student_model.load_meanstd(cfg['TestReader'][ + 'sample_transforms']) + else: + self.model.load_meanstd(cfg['TestReader']['sample_transforms']) # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': - self._eval_batch_sampler = paddle.io.BatchSampler( - self.dataset, batch_size=self.cfg.EvalReader['batch_size']) - reader_name = '{}Reader'.format(self.mode.capitalize()) - # If metric is VOC, need to be set collate_batch=False. - if cfg.metric == 'VOC': - cfg[reader_name]['collate_batch'] = False - self.loader = create(reader_name)(self.dataset, cfg.worker_num, - self._eval_batch_sampler) + if cfg.architecture == 'FairMOT': + self.loader = create('EvalMOTReader')(self.dataset, 0) + else: + self._eval_batch_sampler = paddle.io.BatchSampler( + self.dataset, batch_size=self.cfg.EvalReader['batch_size']) + reader_name = '{}Reader'.format(self.mode.capitalize()) + # If metric is VOC, need to be set collate_batch=False. 
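
The hunk above replaces the old boolean `fp16` switch with four config keys (`amp`, `amp_level`, `custom_white_list`, `custom_black_list`) that later feed `paddle.amp.auto_cast` and a `GradScaler` in the train loop. A minimal, self-contained sketch of that pattern, with a toy linear model and optimizer standing in for the PaddleRS objects:

```
import paddle

# Toy stand-ins; the real trainer builds these from the config.
model = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.SGD(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.rand([8, 4])
with paddle.amp.auto_cast(level='O1'):   # O1: selected ops run in FP16
    loss = model(x).mean()
scaled = scaler.scale(loss)              # scale loss so FP16 grads don't underflow
scaled.backward()
scaler.minimize(opt, scaled)             # unscales grads, then steps the optimizer
opt.clear_grad()
```

In dygraph mode `scaler.minimize` is an unscale plus `optimizer.step`, which is why the AMP branch of the patched train loop has no separate `optimizer.step()` call.
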
+ if cfg.metric == 'VOC': + cfg[reader_name]['collate_batch'] = False + self.loader = create(reader_name)(self.dataset, cfg.worker_num, + self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here # build optimizer in train mode if self.mode == 'train': steps_per_epoch = len(self.loader) + if steps_per_epoch < 1: + logger.warning( + "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." + ) self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) - if self.cfg.get('unstructured_prune'): - self.pruner = create('UnstructuredPruner')(self.model, - steps_per_epoch) + # Unstructured pruner is only enabled in the train mode. + if self.cfg.get('unstructured_prune'): + self.pruner = create('UnstructuredPruner')(self.model, + steps_per_epoch) + if self.use_amp and self.amp_level == 'O2': + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=self.amp_level) + self.use_ema = ('use_ema' in cfg and cfg['use_ema']) + if self.use_ema: + ema_decay = self.cfg.get('ema_decay', 0.9998) + ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') + cycle_epoch = self.cfg.get('cycle_epoch', -1) + ema_black_list = self.cfg.get('ema_black_list', None) + self.ema = ModelEMA( + self.model, + decay=ema_decay, + ema_decay_type=ema_decay_type, + cycle_epoch=cycle_epoch, + ema_black_list=ema_black_list) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() @@ -152,6 +201,8 @@ class Trainer(object): self._callbacks.append(VisualDLWriter(self)) if self.cfg.get('save_proposals', False): self._callbacks.append(SniperProposalsGenerator(self)) + if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg: + self._callbacks.append(WandbCallback(self)) self._compose_callback = ComposeCallback(self._callbacks) elif self.mode == 'eval': self._callbacks = [LogPrinter(self)] @@ -172,7 +223,7 @@ class Trainer(object): classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO": # TODO: bias should be unified - bias = self.cfg['bias'] if 'bias' in self.cfg else 0 + bias = 1 if self.cfg.get('bias', False) else 0 output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) @@ -184,13 +235,14 @@ class Trainer(object): # when do validation in train, annotation file should be get from # EvalReader instead of self.dataset(which is TrainReader) - anno_file = self.dataset.get_anno() - dataset = self.dataset if self.mode == 'train' and validate: eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() dataset = eval_dataset + else: + dataset = self.dataset + anno_file = dataset.get_anno() IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox' if self.cfg.metric == "COCO": @@ -222,11 +274,7 @@ class Trainer(object): output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) - - # pass clsid2catid info to metric instance to avoid multiple loading - # annotation file - clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ - if self.mode == 'eval' else None + imid2path = self.cfg.get('imid2path', None) # when do validation in train, annotation file should be get from # EvalReader instead of self.dataset(which is 
TrainReader) @@ -239,19 +287,25 @@ class Trainer(object): self._metrics = [ RBoxMetric( anno_file=anno_file, - clsid2catid=clsid2catid, classwise=classwise, output_eval=output_eval, bias=bias, - save_prediction_only=save_prediction_only) + save_prediction_only=save_prediction_only, + imid2path=imid2path) ] elif self.cfg.metric == 'VOC': + output_eval = self.cfg['output_eval'] \ + if 'output_eval' in self.cfg else None + save_prediction_only = self.cfg.get('save_prediction_only', False) + self._metrics = [ VOCMetric( label_list=self.dataset.get_label_list(), class_num=self.cfg.num_classes, map_type=self.cfg.map_type, - classwise=classwise) + classwise=classwise, + output_eval=output_eval, + save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'WiderFace': multi_scale = self.cfg.multi_scale_eval if 'multi_scale_eval' in self.cfg else True @@ -334,19 +388,29 @@ class Trainer(object): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer) else: - self.start_epoch = load_weight(self.model, weights, self.optimizer) + self.start_epoch = load_weight(self.model, weights, self.optimizer, + self.ema if self.use_ema else None) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" Init_mark = False + if validate: + self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( + "EvalDataset")() - sync_bn = (getattr(self.cfg, 'norm_type', None) in [None, 'sync_bn'] and + model = self.model + sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and self.cfg.use_gpu and self._nranks > 1) if sync_bn: - self.model = BaseArch.convert_sync_batchnorm(self.model) - - model = self.model + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + + # enabel auto mixed precision mode + if self.use_amp: + scaler = paddle.amp.GradScaler( + enable=self.cfg.use_gpu or self.cfg.use_npu, + init_loss_scaling=self.cfg.get('init_loss_scaling', 1024)) + # get distributed model if self.cfg.get('fleet', False): model = fleet.distributed_model(model) self.optimizer = fleet.distributed_optimizer(self.optimizer) @@ -354,12 +418,7 @@ class Trainer(object): find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False model = paddle.DataParallel( - self.model, find_unused_parameters=find_unused_parameters) - - # initial fp16 - if self.cfg.get('fp16', False): - scaler = amp.GradScaler( - enable=self.cfg.use_gpu, init_loss_scaling=1024) + model, find_unused_parameters=find_unused_parameters) self.status.update({ 'epoch_id': self.start_epoch, @@ -381,6 +440,9 @@ class Trainer(object): self._compose_callback.on_train_begin(self.status) + use_fused_allreduce_gradients = self.cfg[ + 'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False + for epoch_id in range(self.start_epoch, self.cfg.epoch): self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id @@ -395,23 +457,56 @@ class Trainer(object): self._compose_callback.on_step_begin(self.status) data['epoch_id'] = epoch_id - if self.cfg.get('fp16', False): - with amp.auto_cast(enable=self.cfg.use_gpu): - # model forward - outputs = model(data) - loss = outputs['loss'] - - # model backward - scaled_loss = scaler.scale(loss) - scaled_loss.backward() + if self.use_amp: + if isinstance( + model, paddle. 
+ DataParallel) and use_fused_allreduce_gradients: + with model.no_sync(): + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + fused_allreduce_gradients( + list(model.parameters()), None) + else: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + scaled_loss = scaler.scale(loss) + scaled_loss.backward() # in dygraph mode, optimizer.minimize is equal to optimizer.step scaler.minimize(self.optimizer, scaled_loss) else: - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - loss.backward() + if isinstance( + model, paddle. + DataParallel) and use_fused_allreduce_gradients: + with model.no_sync(): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + loss.backward() + fused_allreduce_gradients( + list(model.parameters()), None) + else: + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + loss.backward() self.optimizer.step() curr_lr = self.optimizer.get_lr() self.lr.step() @@ -426,21 +521,23 @@ class Trainer(object): self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) if self.use_ema: - self.ema.update(self.model) + self.ema.update() iter_tic = time.time() - # apply ema weight on model - if self.use_ema: - weight = copy.deepcopy(self.model.state_dict()) - self.model.set_dict(self.ema.apply()) if self.cfg.get('unstructured_prune'): self.pruner.update_params() + is_snapshot = (self._nranks < 2 or self._local_rank == 0) \ + and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) + if is_snapshot and self.use_ema: + # apply ema weight on model + weight = copy.deepcopy(self.model.state_dict()) + self.model.set_dict(self.ema.apply()) + self.status['weight'] = weight + self._compose_callback.on_epoch_end(self.status) - if validate and (self._nranks < 2 or self._local_rank == 0) \ - and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \ - or epoch_id == self.end_epoch - 1): + if validate and is_snapshot: if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset @@ -461,13 +558,15 @@ class Trainer(object): Init_mark = True self._init_metrics(validate=validate) self._reset_metrics() + with paddle.no_grad(): self.status['save_best_model'] = True self._eval_with_loader(self._eval_loader) - # restore origin weight on model - if self.use_ema: + if is_snapshot and self.use_ema: + # reset original weight self.model.set_dict(weight) + self.status.pop('weight') self._compose_callback.on_train_end(self.status) @@ -485,7 +584,15 @@ class Trainer(object): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward - outs = self.model(data) + if self.use_amp: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + outs = self.model(data) + else: + outs = self.model(data) # update metrics for metric in self._metrics: @@ -513,32 +620,267 @@ class Trainer(object): with 
paddle.no_grad(): self._eval_with_loader(self.loader) + def _eval_with_loader_slice(self, + loader, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou'): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + self.model.eval() + if self.cfg.get('print_flops', False): + flops_loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, self.cfg.worker_num, self._eval_batch_sampler) + self._flops(flops_loader) + + merged_bboxs = [] + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + # forward + if self.use_amp: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + outs = self.model(data) + else: + outs = self.model(data) + + shift_amount = data['st_pix'] + outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount + outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount + merged_bboxs.append(outs['bbox']) + + if data['is_last'] > 0: + # merge matching predictions + merged_results = {'bbox': []} + if combine_method == 'nms': + final_boxes = multiclass_nms( + np.concatenate(merged_bboxs), self.cfg.num_classes, + match_threshold, match_metric) + merged_results['bbox'] = np.concatenate(final_boxes) + elif combine_method == 'concat': + merged_results['bbox'] = np.concatenate(merged_bboxs) + else: + raise ValueError( + "Now only support 'nms' or 'concat' to fuse detection results." + ) + merged_results['im_id'] = np.array([[0]]) + merged_results['bbox_num'] = np.array( + [len(merged_results['bbox'])]) + + merged_bboxs = [] + data['im_id'] = data['ori_im_id'] + # update metrics + for metric in self._metrics: + metric.update(data, merged_results) + + # multi-scale inputs: all inputs have same im_id + if isinstance(data, typing.Sequence): + sample_num += data[0]['im_id'].numpy().shape[0] + else: + sample_num += data['im_id'].numpy().shape[0] + + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + # reset metric states for metric may performed multiple times + self._reset_metrics() + + def evaluate_slice(self, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou'): + with paddle.no_grad(): + self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio, + combine_method, match_threshold, + match_metric) + + def slice_predict(self, + images, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou', + draw_threshold=0.5, + output_dir='output', + save_results=False, + visualize=True): + self.dataset.set_slice_images(images, slice_size, overlap_ratio) + loader = create('TestReader')(self.dataset, 0) + + imid2path = self.dataset.get_imid2path() + + anno_file = self.dataset.get_anno() + clsid2catid, catid2name = get_categories( + self.cfg.metric, anno_file=anno_file) + + # Run Infer + self.status['mode'] = 'test' + self.model.eval() + if self.cfg.get('print_flops', False): + flops_loader = create('TestReader')(self.dataset, 0) + self._flops(flops_loader) + + results = [] # all images 
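
The slice-evaluation loop above and the `slice_predict` loop that follows share one convention: each slice's detections come back in slice-local pixel coordinates and must be shifted by the slice origin `st_pix` before merging. A toy NumPy illustration, assuming ppdet's `[class_id, score, x1, y1, x2, y2]` row layout:

```
import numpy as np

# One fake detection from a slice whose top-left corner sits at (320, 240)
# of the full image (row layout: [class_id, score, x1, y1, x2, y2]).
bbox = np.array([[0., 0.9, 10., 20., 60., 80.]])
st_pix = np.array([[320., 240.]])  # hypothetical slice origin

bbox[:, 2:4] += st_pix  # shift top-left corner
bbox[:, 4:6] += st_pix  # shift bottom-right corner
print(bbox)             # -> [[0. 0.9 330. 260. 380. 320.]]
```

Once `is_last` signals the final slice of an image, the shifted boxes are fused either with `multiclass_nms` or by plain concatenation, per `combine_method`.
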
+ merged_bboxs = [] # single image + for step_id, data in enumerate(tqdm(loader)): + self.status['step_id'] = step_id + # forward + outs = self.model(data) + + outs['bbox'] = outs['bbox'].numpy() # only in test mode + shift_amount = data['st_pix'] + outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy() + outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy() + merged_bboxs.append(outs['bbox']) + + if data['is_last'] > 0: + # merge matching predictions + merged_results = {'bbox': []} + if combine_method == 'nms': + final_boxes = multiclass_nms( + np.concatenate(merged_bboxs), self.cfg.num_classes, + match_threshold, match_metric) + merged_results['bbox'] = np.concatenate(final_boxes) + elif combine_method == 'concat': + merged_results['bbox'] = np.concatenate(merged_bboxs) + else: + raise ValueError( + "Now only support 'nms' or 'concat' to fuse detection results." + ) + merged_results['im_id'] = np.array([[0]]) + merged_results['bbox_num'] = np.array( + [len(merged_results['bbox'])]) + + merged_bboxs = [] + data['im_id'] = data['ori_im_id'] + + for key in ['im_shape', 'scale_factor', 'im_id']: + if isinstance(data, typing.Sequence): + merged_results[key] = data[0][key] + else: + merged_results[key] = data[key] + for key, value in merged_results.items(): + if hasattr(value, 'numpy'): + merged_results[key] = value.numpy() + results.append(merged_results) + + if visualize: + for outs in results: + batch_res = get_infer_results(outs, clsid2catid) + bbox_num = outs['bbox_num'] + start = 0 + for i, im_id in enumerate(outs['im_id']): + image_path = imid2path[int(im_id)] + image = Image.open(image_path).convert('RGB') + image = ImageOps.exif_transpose(image) + self.status['original_image'] = np.array(image.copy()) + end = start + bbox_num[i] + bbox_res = batch_res['bbox'][start:end] \ + if 'bbox' in batch_res else None + mask_res, segm_res, keypoint_res = None, None, None + image = visualize_results( + image, bbox_res, mask_res, segm_res, keypoint_res, + int(im_id), catid2name, draw_threshold) + self.status['result_image'] = np.array(image.copy()) + if self._compose_callback: + self._compose_callback.on_step_end(self.status) + # save image with detection + save_name = self._get_save_image_name(output_dir, + image_path) + logger.info("Detection bbox results save in {}".format( + save_name)) + image.save(save_name, quality=95) + start = end + def predict(self, images, draw_threshold=0.5, output_dir='output', - save_txt=False): + save_results=False, + visualize=True): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + self.dataset.set_images(images) loader = create('TestReader')(self.dataset, 0) imid2path = self.dataset.get_imid2path() + def setup_metrics_for_loader(): + # mem + metrics = copy.deepcopy(self._metrics) + mode = self.mode + save_prediction_only = self.cfg[ + 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None + output_eval = self.cfg[ + 'output_eval'] if 'output_eval' in self.cfg else None + + # modify + self.mode = '_test' + self.cfg['save_prediction_only'] = True + self.cfg['output_eval'] = output_dir + self.cfg['imid2path'] = imid2path + self._init_metrics() + + # restore + self.mode = mode + self.cfg.pop('save_prediction_only') + if save_prediction_only is not None: + self.cfg['save_prediction_only'] = save_prediction_only + + self.cfg.pop('output_eval') + if output_eval is not None: + self.cfg['output_eval'] = output_eval + + self.cfg.pop('imid2path') + + _metrics = copy.deepcopy(self._metrics) + self._metrics = metrics + + 
return _metrics + + if save_results: + metrics = setup_metrics_for_loader() + else: + metrics = [] + anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories( self.cfg.metric, anno_file=anno_file) - # Run Infer + # Run Infer self.status['mode'] = 'test' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('TestReader')(self.dataset, 0) self._flops(flops_loader) results = [] - for step_id, data in enumerate(loader): + for step_id, data in enumerate(tqdm(loader)): self.status['step_id'] = step_id # forward outs = self.model(data) + for _m in metrics: + _m.update(data, outs) + for key in ['im_shape', 'scale_factor', 'im_id']: if isinstance(data, typing.Sequence): outs[key] = data[0][key] @@ -548,64 +890,64 @@ class Trainer(object): if hasattr(value, 'numpy'): outs[key] = value.numpy() results.append(outs) + # sniper if type(self.dataset) == SniperCOCODataSet: results = self.dataset.anno_cropper.aggregate_chips_detections( results) - for outs in results: - batch_res = get_infer_results(outs, clsid2catid) - bbox_num = outs['bbox_num'] - - start = 0 - for i, im_id in enumerate(outs['im_id']): - image_path = imid2path[int(im_id)] - image = Image.open(image_path).convert('RGB') - image = ImageOps.exif_transpose(image) - self.status['original_image'] = np.array(image.copy()) - - end = start + bbox_num[i] - bbox_res = batch_res['bbox'][start:end] \ - if 'bbox' in batch_res else None - mask_res = batch_res['mask'][start:end] \ - if 'mask' in batch_res else None - segm_res = batch_res['segm'][start:end] \ - if 'segm' in batch_res else None - keypoint_res = batch_res['keypoint'][start:end] \ - if 'keypoint' in batch_res else None - image = visualize_results( - image, bbox_res, mask_res, segm_res, keypoint_res, - int(im_id), catid2name, draw_threshold) - self.status['result_image'] = np.array(image.copy()) - if self._compose_callback: - self._compose_callback.on_step_end(self.status) - # save image with detection - save_name = self._get_save_image_name(output_dir, image_path) - logger.info("Detection bbox results save in {}".format( - save_name)) - image.save(save_name, quality=95) - if save_txt: - save_path = os.path.splitext(save_name)[0] + '.txt' - results = {} - results["im_id"] = im_id - if bbox_res: - results["bbox_res"] = bbox_res - if keypoint_res: - results["keypoint_res"] = keypoint_res - save_result(save_path, results, catid2name, draw_threshold) - start = end + for _m in metrics: + _m.accumulate() + _m.reset() + + if visualize: + for outs in results: + batch_res = get_infer_results(outs, clsid2catid) + bbox_num = outs['bbox_num'] + + start = 0 + for i, im_id in enumerate(outs['im_id']): + image_path = imid2path[int(im_id)] + image = Image.open(image_path).convert('RGB') + image = ImageOps.exif_transpose(image) + self.status['original_image'] = np.array(image.copy()) + + end = start + bbox_num[i] + bbox_res = batch_res['bbox'][start:end] \ + if 'bbox' in batch_res else None + mask_res = batch_res['mask'][start:end] \ + if 'mask' in batch_res else None + segm_res = batch_res['segm'][start:end] \ + if 'segm' in batch_res else None + keypoint_res = batch_res['keypoint'][start:end] \ + if 'keypoint' in batch_res else None + image = visualize_results( + image, bbox_res, mask_res, segm_res, keypoint_res, + int(im_id), catid2name, draw_threshold) + self.status['result_image'] = np.array(image.copy()) + if self._compose_callback: + self._compose_callback.on_step_end(self.status) + # save image with detection + save_name = 
self._get_save_image_name(output_dir, + image_path) + logger.info("Detection bbox results save in {}".format( + save_name)) + image.save(save_name, quality=95) + + start = end def _get_save_image_name(self, output_dir, image_path): """ Get save image name from source image path. """ - if not os.path.exists(output_dir): - os.makedirs(output_dir) image_name = os.path.split(image_path)[-1] name, ext = os.path.splitext(image_name) return os.path.join(output_dir, "{}".format(name)) + ext - def _get_infer_cfg_and_input_spec(self, save_dir, prune_input=True): + def _get_infer_cfg_and_input_spec(self, + save_dir, + prune_input=True, + kl_quant=False): image_shape = None im_shape = [None, 2] scale_factor = [None, 2] @@ -628,9 +970,27 @@ class Trainer(object): if hasattr(self.model, 'deploy'): self.model.deploy = True + + if 'slim' not in self.cfg: + for layer in self.model.sublayers(): + if hasattr(layer, 'convert_to_deploy'): + layer.convert_to_deploy() + + export_post_process = self.cfg['export'].get( + 'post_process', False) if hasattr(self.cfg, 'export') else True + export_nms = self.cfg['export'].get('nms', False) if hasattr( + self.cfg, 'export') else True + export_benchmark = self.cfg['export'].get( + 'benchmark', False) if hasattr(self.cfg, 'export') else False if hasattr(self.model, 'fuse_norm'): self.model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize', False) + if hasattr(self.model, 'export_post_process'): + self.model.export_post_process = export_post_process if not export_benchmark else False + if hasattr(self.model, 'export_nms'): + self.model.export_nms = export_nms if not export_benchmark else False + if export_post_process and not export_benchmark: + image_shape = [None] + image_shape[1:] # Save infer cfg _dump_infer_config(self.cfg, @@ -663,16 +1023,34 @@ class Trainer(object): pruned_input_spec = input_spec # TODO: Hard code, delete it when support prune input_spec. 
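
The export helper above now keys its behaviour off an optional `export` block in the config (`post_process`, `nms`, `benchmark`) and prunes the input spec to match. As a rough sketch (the static 640x640 shape is illustrative, not from the patch), a spec that keeps post-processing in the exported graph would look like:

```
from paddle.static import InputSpec

# Hypothetical pruned spec: with post-processing exported, the graph also
# needs im_shape/scale_factor to map boxes back to the original image.
pruned_input_spec = [{
    'image': InputSpec(shape=[None, 3, 640, 640], name='image'),
    'im_shape': InputSpec(shape=[None, 2], name='im_shape'),
    'scale_factor': InputSpec(shape=[None, 2], name='scale_factor'),
}]
```
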
- if self.cfg.architecture == 'PicoDet': + if self.cfg.architecture == 'PicoDet' and not export_post_process: pruned_input_spec = [{ "image": InputSpec( shape=image_shape, name='image') }] + if kl_quant: + if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights: + pruned_input_spec = [{ + "image": InputSpec( + shape=image_shape, name='image'), + "scale_factor": InputSpec( + shape=scale_factor, name='scale_factor') + }] + elif 'tinypose' in self.cfg.weights: + pruned_input_spec = [{ + "image": InputSpec( + shape=image_shape, name='image') + }] return static_model, pruned_input_spec def export(self, output_dir='output_inference'): self.model.eval() + + if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ + 'export'] and self.cfg['export']['fuse_conv_bn']: + self.model = fuse_conv_bn(self.model) + model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] save_dir = os.path.join(output_dir, model_name) if not os.path.exists(save_dir): @@ -682,7 +1060,7 @@ class Trainer(object): save_dir) # dy2st and save model - if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT': + if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']: paddle.jit.save( static_model, os.path.join(save_dir, 'model'), @@ -706,8 +1084,9 @@ class Trainer(object): break # TODO: support prune input_spec + kl_quant = True if hasattr(self.cfg.slim, 'ptq') else False _, pruned_input_spec = self._get_infer_cfg_and_input_spec( - save_dir, prune_input=False) + save_dir, prune_input=False, kl_quant=kl_quant) self.cfg.slim.save_quantized_model( self.model, @@ -739,3 +1118,29 @@ class Trainer(object): flops = flops(self.model, input_spec) / (1000**3) logger.info(" Model FLOPs : {:.6f}G. (image shape is {})".format( flops, input_data['image'][0].unsqueeze(0).shape)) + + def parse_mot_images(self, cfg): + import glob + # for quant + dataset_dir = cfg['EvalMOTDataset'].dataset_dir + data_root = cfg['EvalMOTDataset'].data_root + data_root = '{}/{}'.format(dataset_dir, data_root) + seqs = os.listdir(data_root) + seqs.sort() + all_images = [] + for seq in seqs: + infer_dir = os.path.join(data_root, seq) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + images = set() + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + images.sort() + assert len(images) > 0, "no image found in {}".format(infer_dir) + all_images.extend(images) + logger.info("Found {} inference images in total.".format( + len(images))) + return all_images diff --git a/paddlers/models/ppdet/ext_op/README.md b/paddlers/models/ppdet/ext_op/README.md new file mode 100644 index 0000000..0d67062 --- /dev/null +++ b/paddlers/models/ppdet/ext_op/README.md @@ -0,0 +1,35 @@ +# 自定义OP编译 +旋转框IOU计算OP是参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) 。 + +## 1. 环境依赖 +- Paddle >= 2.0.1 +- gcc 8.2 + +## 2. 安装 +``` +python setup.py install +``` + +编译完成后即可使用,以下为`rbox_iou`的使用示例 +``` +# 引入自定义op +from ext_op import rbox_iou + +paddle.set_device('gpu:0') +paddle.disable_static() + +rbox1 = np.random.rand(13000, 5) +rbox2 = np.random.rand(7, 5) + +pd_rbox1 = paddle.to_tensor(rbox1) +pd_rbox2 = paddle.to_tensor(rbox2) + +iou = rbox_iou(pd_rbox1, pd_rbox2) +print('iou', iou) +``` + +## 3. 
单元测试 +可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示: +``` +python unittest/test_matched_rbox_iou.py +``` diff --git a/paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cc b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cc new file mode 100644 index 0000000..2c3c58b --- /dev/null +++ b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// The code is based on +// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated + +#include "paddle/extension.h" +#include "rbox_iou_op.h" + +template +void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr, + const T *rbox2_data_ptr, T *output_data_ptr) { + + int i; + for (i = 0; i < rbox_num; i++) { + output_data_ptr[i] = + rbox_iou_single(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5); + } +} + +#define CHECK_INPUT_CPU(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +std::vector MatchedRboxIouCPUForward(const paddle::Tensor &rbox1, + const paddle::Tensor &rbox2) { + CHECK_INPUT_CPU(rbox1); + CHECK_INPUT_CPU(rbox2); + PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim"); + + auto rbox_num = rbox1.shape()[0]; + auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox_num}); + + PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rotated_iou_cpu_kernel", ([&] { + matched_rbox_iou_cpu_kernel( + rbox_num, rbox1.data(), + rbox2.data(), + output.mutable_data()); + })); + + return {output}; +} + +#ifdef PADDLE_WITH_CUDA +std::vector MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1, + const paddle::Tensor &rbox2); +#endif + +#define CHECK_INPUT_SAME(x1, x2) \ + PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.") + +std::vector MatchedRboxIouForward(const paddle::Tensor &rbox1, + const paddle::Tensor &rbox2) { + CHECK_INPUT_SAME(rbox1, rbox2); + if (rbox1.place() == paddle::PlaceType::kCPU) { + return MatchedRboxIouCPUForward(rbox1, rbox2); +#ifdef PADDLE_WITH_CUDA + } else if (rbox1.place() == paddle::PlaceType::kGPU) { + return MatchedRboxIouCUDAForward(rbox1, rbox2); +#endif + } +} + +std::vector> +MatchedRboxIouInferShape(std::vector rbox1_shape, + std::vector rbox2_shape) { + return {{rbox1_shape[0]}}; +} + +std::vector MatchedRboxIouInferDtype(paddle::DataType t1, + paddle::DataType t2) { + return {t1}; +} + +PD_BUILD_OP(matched_rbox_iou) + .Inputs({"RBOX1", "RBOX2"}) + .Outputs({"Output"}) + .SetKernelFn(PD_KERNEL(MatchedRboxIouForward)) + .SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype)); diff --git a/paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cu b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cu new file mode 100644 index 0000000..8d03ecc --- /dev/null +++ b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cu @@ -0,0 +1,63 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// The code is based on
+// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+
+#include "paddle/extension.h"
+#include "rbox_iou_op.h"
+
+/**
+   Computes ceil(a / b)
+*/
+
+static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; }
+
+template <typename T>
+__global__ void
+matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
+                             const T *rbox2_data_ptr, T *output_data_ptr) {
+  for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
+       tid += blockDim.x * gridDim.x) {
+    output_data_ptr[tid] =
+        rbox_iou_single<T>(rbox1_data_ptr + tid * 5, rbox2_data_ptr + tid * 5);
+  }
+}
+
+#define CHECK_INPUT_GPU(x)                                                     \
+  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
+
+std::vector<paddle::Tensor>
+MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
+                          const paddle::Tensor &rbox2) {
+  CHECK_INPUT_GPU(rbox1);
+  CHECK_INPUT_GPU(rbox2);
+  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");
+
+  auto rbox_num = rbox1.shape()[0];
+
+  auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox_num});
+
+  const int thread_per_block = 512;
+  const int block_per_grid = CeilDiv(rbox_num, thread_per_block);
+
+  PD_DISPATCH_FLOATING_TYPES(
+      rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] {
+        matched_rbox_iou_cuda_kernel<
+            data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
+            rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
+            output.mutable_data<data_t>());
+      }));
+
+  return {output};
+}
diff --git a/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc
new file mode 100644
index 0000000..d66004e
--- /dev/null
+++ b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
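
Like the CPU path, the CUDA kernel above pairs row `i` of `RBOX1` with row `i` of `RBOX2` in a grid-stride loop over `rbox_num`, so both inputs must share the same leading dimension. A usage sketch in the spirit of the README example, with illustrative shapes:

```
import numpy as np
import paddle
from ext_op import matched_rbox_iou  # built via `python setup.py install`

paddle.set_device('gpu:0')  # a CPU fallback exists if CUDA is unavailable

# Each row is [x_ctr, y_ctr, w, h, angle]; the two lengths must match.
rbox1 = paddle.to_tensor(np.random.rand(1000, 5))
rbox2 = paddle.to_tensor(np.random.rand(1000, 5))
iou = matched_rbox_iou(rbox1, rbox2)  # shape [1000]: one IoU per row pair
```
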
+// +// The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated + +#include "rbox_iou_op.h" +#include "paddle/extension.h" + + +template +void rbox_iou_cpu_kernel( + const int rbox1_num, + const int rbox2_num, + const T* rbox1_data_ptr, + const T* rbox2_data_ptr, + T* output_data_ptr) { + + int i, j; + for (i = 0; i < rbox1_num; i++) { + for (j = 0; j < rbox2_num; j++) { + int offset = i * rbox2_num + j; + output_data_ptr[offset] = rbox_iou_single(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5); + } + } +} + + +#define CHECK_INPUT_CPU(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +std::vector RboxIouCPUForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) { + CHECK_INPUT_CPU(rbox1); + CHECK_INPUT_CPU(rbox2); + + auto rbox1_num = rbox1.shape()[0]; + auto rbox2_num = rbox2.shape()[0]; + + auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox1_num, rbox2_num}); + + PD_DISPATCH_FLOATING_TYPES( + rbox1.type(), + "rbox_iou_cpu_kernel", + ([&] { + rbox_iou_cpu_kernel( + rbox1_num, + rbox2_num, + rbox1.data(), + rbox2.data(), + output.mutable_data()); + })); + + return {output}; +} + + +#ifdef PADDLE_WITH_CUDA +std::vector RboxIouCUDAForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2); +#endif + + +#define CHECK_INPUT_SAME(x1, x2) PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.") + +std::vector RboxIouForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) { + CHECK_INPUT_SAME(rbox1, rbox2); + if (rbox1.place() == paddle::PlaceType::kCPU) { + return RboxIouCPUForward(rbox1, rbox2); +#ifdef PADDLE_WITH_CUDA + } else if (rbox1.place() == paddle::PlaceType::kGPU) { + return RboxIouCUDAForward(rbox1, rbox2); +#endif + } +} + +std::vector> InferShape(std::vector rbox1_shape, std::vector rbox2_shape) { + return {{rbox1_shape[0], rbox2_shape[0]}}; +} + +std::vector InferDtype(paddle::DataType t1, paddle::DataType t2) { + return {t1}; +} + +PD_BUILD_OP(rbox_iou) + .Inputs({"RBOX1", "RBOX2"}) + .Outputs({"Output"}) + .SetKernelFn(PD_KERNEL(RboxIouForward)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)); diff --git a/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cu b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cu new file mode 100644 index 0000000..a61be13 --- /dev/null +++ b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cu @@ -0,0 +1,114 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
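
By contrast, the `rbox_iou` op defined in the file above is pairwise: `[M, 5]` and `[N, 5]` inputs yield an `[M, N]` IoU matrix (see its `InferShape`). A NumPy reference of the CPU kernel's layout, with `iou_single` standing in for the C++ `rbox_iou_single`:

```
import numpy as np

def rbox_iou_ref(rbox1, rbox2, iou_single):
    # Entry (i, j) is IoU(rbox1[i], rbox2[j]), matching the kernel's
    # row-major offset = i * rbox2_num + j.
    m, n = rbox1.shape[0], rbox2.shape[0]
    out = np.zeros((m, n), dtype=rbox1.dtype)
    for i in range(m):
        for j in range(n):
            out[i, j] = iou_single(rbox1[i], rbox2[j])
    return out
```
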
+// +// The code is based on +// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated + +#include "paddle/extension.h" +#include "rbox_iou_op.h" + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +/** + Computes ceil(a / b) +*/ + +static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; } + +template +__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num, + const T *rbox1_data_ptr, + const T *rbox2_data_ptr, + T *output_data_ptr) { + + // get row_start and col_start + const int rbox1_block_idx = blockIdx.x * blockDim.x; + const int rbox2_block_idx = blockIdx.y * blockDim.y; + + const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x); + const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y); + + __shared__ T block_boxes1[BLOCK_DIM_X * 5]; + __shared__ T block_boxes2[BLOCK_DIM_Y * 5]; + + // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y + if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) { + block_boxes1[threadIdx.x * 5 + 0] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0]; + block_boxes1[threadIdx.x * 5 + 1] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1]; + block_boxes1[threadIdx.x * 5 + 2] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2]; + block_boxes1[threadIdx.x * 5 + 3] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3]; + block_boxes1[threadIdx.x * 5 + 4] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4]; + } + + // threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as + // above: threadIdx.y == 0 + if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) { + block_boxes2[threadIdx.x * 5 + 0] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0]; + block_boxes2[threadIdx.x * 5 + 1] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1]; + block_boxes2[threadIdx.x * 5 + 2] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2]; + block_boxes2[threadIdx.x * 5 + 3] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3]; + block_boxes2[threadIdx.x * 5 + 4] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4]; + } + + // sync + __syncthreads(); + + if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) { + int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx + + threadIdx.y; + output_data_ptr[offset] = rbox_iou_single( + block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); + } +} + +#define CHECK_INPUT_GPU(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.") + +std::vector RboxIouCUDAForward(const paddle::Tensor &rbox1, + const paddle::Tensor &rbox2) { + CHECK_INPUT_GPU(rbox1); + CHECK_INPUT_GPU(rbox2); + + auto rbox1_num = rbox1.shape()[0]; + auto rbox2_num = rbox2.shape()[0]; + + auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox1_num, rbox2_num}); + + const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X); + const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y); + + dim3 blocks(blocks_x, blocks_y); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); + + PD_DISPATCH_FLOATING_TYPES( + rbox1.type(), "rbox_iou_cuda_kernel", ([&] { + rbox_iou_cuda_kernel<<>>( + rbox1_num, rbox2_num, rbox1.data(), rbox2.data(), + output.mutable_data()); + })); + + return {output}; +} diff --git a/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.h b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.h new file mode 100644 index 0000000..b592c39 
--- /dev/null +++ b/paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.h @@ -0,0 +1,348 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// The code is based on +// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated + +#pragma once + +#include +#include +#include + +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ +#else +#include +#define HOST_DEVICE +#define HOST_DEVICE_INLINE HOST_DEVICE inline +#endif + +namespace { + +template struct RotatedBox { T x_ctr, y_ctr, w, h, a; }; + +template struct Point { + T x, y; + HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {} + HOST_DEVICE_INLINE Point operator+(const Point &p) const { + return Point(x + p.x, y + p.y); + } + HOST_DEVICE_INLINE Point &operator+=(const Point &p) { + x += p.x; + y += p.y; + return *this; + } + HOST_DEVICE_INLINE Point operator-(const Point &p) const { + return Point(x - p.x, y - p.y); + } + HOST_DEVICE_INLINE Point operator*(const T coeff) const { + return Point(x * coeff, y * coeff); + } +}; + +template +HOST_DEVICE_INLINE T dot_2d(const Point &A, const Point &B) { + return A.x * B.x + A.y * B.y; +} + +template +HOST_DEVICE_INLINE T cross_2d(const Point &A, const Point &B) { + return A.x * B.y - B.x * A.y; +} + +template +HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox &box, + Point (&pts)[4]) { + // M_PI / 180. 
== 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; +} + +template +HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], + const Point (&pts2)[4], + Point (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto &AB = vec2[0]; + const auto &DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto &AB = vec1[0]; + const auto &DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +template +HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], + const int &num_in, Point (&q)[24], + bool shift_to_zero = false) { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
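
`get_intersection_points` above solves each segment pair with 2D cross products: writing the segments as `p1 + t1*v1` and `p2 + t2*v2`, it uses `t1 = cross(v2, p2 - p1) / cross(v2, v1)` and keeps solutions with both parameters in `[0, 1]` (a near-zero `det` filters parallel pairs). A quick NumPy check of that solve:

```
import numpy as np

def cross_2d(a, b):  # same as the header's cross_2d: A.x*B.y - B.x*A.y
    return a[0] * b[1] - b[0] * a[1]

# Segments (0,0)->(2,2) and (0,2)->(2,0) should cross at (1,1).
p1, v1 = np.array([0., 0.]), np.array([2., 2.])
p2, v2 = np.array([0., 2.]), np.array([2., -2.])
det = cross_2d(v2, v1)           # non-zero -> not parallel
t1 = cross_2d(v2, p2 - p1) / det
print(p1 + t1 * v1)              # -> [1. 1.]
```
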
+ int t = 0; + for (int i = 1; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto &start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + +#ifdef __CUDACC__ + // CUDA version + // In the future, we can potentially use thrust + // for sorting here to improve speed (though not guaranteed) + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } +#else + // CPU version + std::sort(q + 1, q + num_in, + [](const Point &A, const Point &B) -> bool { + T temp = cross_2d(A, B); + if (fabs(temp) < 1e-6) { + return dot_2d(A, A) < dot_2d(B, B); + } else { + return temp > 0; + } + }); +#endif + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) { + if (dist[k] > 1e-8) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. + // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { + m--; + } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) { + for (int i = 0; i < m; i++) { + q[i] += start; + } + } + + return m; +} + +template +HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int &m) { + if (m <= 2) { + return 0; + } + + T area = 0; + for (int i = 1; i < m - 1; i++) { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template +HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox &box1, + const RotatedBox &box2) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; + + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); + + int num = get_intersection_points(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. 
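
The scan above sorts candidates by angle around the lowest point and pops non-convex turns off a stack; the resulting hull is handed to `polygon_area` (called just below), a triangle-fan variant of the shoelace formula. A compact NumPy version of that area step, valid for the convex, ordered output of the scan:

```
import numpy as np

def polygon_area(q):
    # Fan the polygon into triangles rooted at q[0] and sum |cross|/2,
    # mirroring the header's fabs(cross_2d(q[i]-q[0], q[i+1]-q[0])) loop.
    q = np.asarray(q, dtype=float)
    v = q[1:] - q[0]
    cross = v[:-1, 0] * v[1:, 1] - v[1:, 0] * v[:-1, 1]
    return np.abs(cross).sum() / 2.0

print(polygon_area([[0, 0], [1, 0], [1, 1], [0, 1]]))  # unit square -> 1.0
```
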
+ int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); +} + +} // namespace + +template +HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw, + T const *const box2_raw) { + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + const T area1 = box1.w * box1.h; + const T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + const T intersection = rboxes_intersection(box1, box2); + const T iou = intersection / (area1 + area2 - intersection); + return iou; +} diff --git a/paddlers/models/ppdet/ext_op/setup.py b/paddlers/models/ppdet/ext_op/setup.py new file mode 100644 index 0000000..5892f46 --- /dev/null +++ b/paddlers/models/ppdet/ext_op/setup.py @@ -0,0 +1,33 @@ +import os +import glob +import paddle +from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup + + +def get_extensions(): + root_dir = os.path.dirname(os.path.abspath(__file__)) + ext_root_dir = os.path.join(root_dir, 'csrc') + sources = [] + for ext_name in os.listdir(ext_root_dir): + ext_dir = os.path.join(ext_root_dir, ext_name) + source = glob.glob(os.path.join(ext_dir, '*.cc')) + kwargs = dict() + if paddle.device.is_compiled_with_cuda(): + source += glob.glob(os.path.join(ext_dir, '*.cu')) + + if not source: + continue + + sources += source + + if paddle.device.is_compiled_with_cuda(): + extension = CUDAExtension( + sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']}) + else: + extension = CppExtension(sources) + + return extension + + +if __name__ == "__main__": + setup(name='ext_op', ext_modules=get_extensions()) diff --git a/paddlers/models/ppdet/ext_op/unittest/test_matched_rbox_iou.py b/paddlers/models/ppdet/ext_op/unittest/test_matched_rbox_iou.py new file mode 100644 index 0000000..bff95b3 --- /dev/null +++ b/paddlers/models/ppdet/ext_op/unittest/test_matched_rbox_iou.py @@ -0,0 +1,149 @@ +import numpy as np +import sys +import time +from shapely.geometry import Polygon +import paddle +import unittest + +from ext_op import matched_rbox_iou + + +def rbox2poly_single(rrect, get_best_begin_point=False): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + # rect 2x4 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + # poly + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) + return poly + + +def intersection(g, p): + """ + Intersection. 
+ """ + + g = g[:8].reshape((4, 2)) + p = p[:8].reshape((4, 2)) + + a = g + b = p + + use_filter = True + if use_filter: + # step1: + inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) + inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) + inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) + inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) + if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: + return 0. + x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) + x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) + y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) + y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) + if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: + return 0. + + g = Polygon(g) + p = Polygon(p) + if not g.is_valid or not p.is_valid: + return 0 + + inter = Polygon(g).intersection(Polygon(p)).area + union = g.area + p.area - inter + if union == 0: + return 0 + else: + return inter / union + + +def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False): + """ + + Args: + anchors: [M, 5] x1,y1,x2,y2,angle + gt_bboxes: [M, 5] x1,y1,x2,y2,angle + + Returns: + macthed_iou: [M] + """ + assert anchors.shape[1] == 5 + assert gt_bboxes.shape[1] == 5 + + gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] + anchors_ploy = [rbox2poly_single(e) for e in anchors] + + num = len(anchors_ploy) + iou = np.zeros((num, ), dtype=np.float64) + + start_time = time.time() + for i in range(num): + try: + iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i]) + except Exception as e: + print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i], 'anchors_ploy[j]', + anchors_ploy[i], e) + return iou + + +def gen_sample(n): + rbox = np.random.rand(n, 5) + rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 + rbox[:, 4] = rbox[:, 4] - 0.5 + return rbox + + +class MatchedRBoxIoUTest(unittest.TestCase): + def setUp(self): + self.initTestCase() + self.rbox1 = gen_sample(self.n) + self.rbox2 = gen_sample(self.n) + + def initTestCase(self): + self.n = 1000 + + def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): + self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) + + def get_places(self): + places = [paddle.CPUPlace()] + if paddle.device.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + return places + + def check_output(self, place): + paddle.disable_static() + pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) + pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) + actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy() + poly_rbox1 = self.rbox1 + poly_rbox2 = self.rbox2 + poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 + poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 + expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) + self.assertAllClose( + actual_t, + expect_t, + msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( + str(place), str(expect_t), str(actual_t))) + + def test_output(self): + places = self.get_places() + for place in places: + self.check_output(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/paddlers/models/ppdet/ext_op/unittest/test_rbox_iou.py b/paddlers/models/ppdet/ext_op/unittest/test_rbox_iou.py new file mode 100644 index 0000000..8ef19ae --- /dev/null +++ b/paddlers/models/ppdet/ext_op/unittest/test_rbox_iou.py @@ -0,0 +1,151 @@ +import numpy as np +import sys +import time +from shapely.geometry import Polygon +import paddle +import unittest + +from ext_op import rbox_iou + + +def rbox2poly_single(rrect, get_best_begin_point=False): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + 
+    poly:[x0,y0,x1,y1,x2,y2,x3,y3]
+    """
+    x_ctr, y_ctr, width, height, angle = rrect[:5]
+    tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
+    # rect 2x4
+    rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
+    R = np.array([[np.cos(angle), -np.sin(angle)],
+                  [np.sin(angle), np.cos(angle)]])
+    # poly
+    poly = R.dot(rect)
+    x0, x1, x2, x3 = poly[0, :4] + x_ctr
+    y0, y1, y2, y3 = poly[1, :4] + y_ctr
+    poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64)
+    return poly
+
+
+def intersection(g, p):
+    """
+    Intersection.
+    """
+
+    g = g[:8].reshape((4, 2))
+    p = p[:8].reshape((4, 2))
+
+    a = g
+    b = p
+
+    use_filter = True
+    if use_filter:
+        # Step 1: quick reject based on axis-aligned bounding rectangles
+        inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0]))
+        inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0]))
+        inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1]))
+        inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1]))
+        if inter_x1 >= inter_x2 or inter_y1 >= inter_y2:
+            return 0.
+        x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0]))
+        x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0]))
+        y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1]))
+        y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1]))
+        if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2:
+            return 0.
+
+    g = Polygon(g)
+    p = Polygon(p)
+    if not g.is_valid or not p.is_valid:
+        return 0
+
+    inter = g.intersection(p).area
+    union = g.area + p.area - inter
+    if union == 0:
+        return 0
+    else:
+        return inter / union
+
+
+def rbox_overlaps(anchors, gt_bboxes, use_cv2=False):
+    """
+
+    Args:
+        anchors: [NA, 5] x_ctr,y_ctr,w,h,angle
+        gt_bboxes: [M, 5] x_ctr,y_ctr,w,h,angle
+
+    Returns:
+        iou: [NA, M]
+    """
+    assert anchors.shape[1] == 5
+    assert gt_bboxes.shape[1] == 5
+
+    gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]
+    anchors_ploy = [rbox2poly_single(e) for e in anchors]
+
+    num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy)
+    iou = np.zeros((num_anchors, num_gt), dtype=np.float64)
+
+    start_time = time.time()
+    for i in range(num_anchors):
+        for j in range(num_gt):
+            try:
+                iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j])
+            except Exception as e:
+                print('cur anchors_ploy[i]', anchors_ploy[i],
+                      'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e)
+    return iou
+
+
+def gen_sample(n):
+    rbox = np.random.rand(n, 5)
+    rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001
+    rbox[:, 4] = rbox[:, 4] - 0.5
+    return rbox
+
+
+class RBoxIoUTest(unittest.TestCase):
+    def setUp(self):
+        self.initTestCase()
+        self.rbox1 = gen_sample(self.n)
+        self.rbox2 = gen_sample(self.m)
+
+    def initTestCase(self):
+        self.n = 13000
+        self.m = 7
+
+    def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):
+        self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)
+
+    def get_places(self):
+        places = [paddle.CPUPlace()]
+        if paddle.device.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
+
+        return places
+
+    def check_output(self, place):
+        paddle.disable_static()
+        pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)
+        pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)
+        actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy()
+        poly_rbox1 = self.rbox1
+        poly_rbox2 = self.rbox2
+        poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024
+        poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024
+        expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)
+        self.assertAllClose(
+            actual_t,
+            expect_t,
+            msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format(
+                str(place), str(expect_t), str(actual_t)))
+
+    def test_output(self):
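The two unit tests here validate `ext_op.rbox_iou` and `ext_op.matched_rbox_iou` against a shapely-based reference. As a quick orientation (an illustrative sketch, not part of the patch), building and calling the custom op looks roughly like this; the build step assumes it is run from `paddlers/models/ppdet/ext_op`:

```python
# Illustrative sketch only. Assumes the custom ops were first compiled with
#   python setup.py install
# (run inside paddlers/models/ppdet/ext_op), which registers the `ext_op`
# module imported by the tests.
import numpy as np
import paddle
from ext_op import rbox_iou

# Rotated boxes are [x_ctr, y_ctr, w, h, angle]; the values here are made up.
boxes_a = paddle.to_tensor(
    np.array([[50., 50., 100., 300., 0.5]], dtype='float32'))
boxes_b = paddle.to_tensor(
    np.array([[50., 50., 100., 300., 0.5],
              [50., 50., 300., 100., -0.5]], dtype='float32'))

iou = rbox_iou(boxes_a, boxes_b)  # pairwise IoU matrix, shape [1, 2]
print(iou.numpy())                # first entry should be ~1.0 (identical box)
```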
places = self.get_places() + for place in places: + self.check_output(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/paddlers/models/ppdet/metrics/__init__.py b/paddlers/models/ppdet/metrics/__init__.py index e0659bd..dc1a53b 100644 --- a/paddlers/models/ppdet/metrics/__init__.py +++ b/paddlers/models/ppdet/metrics/__init__.py @@ -26,4 +26,4 @@ __all__ = metrics.__all__ + mot_metrics.__all__ from . import mcmot_metrics from .mcmot_metrics import * -__all__ = metrics.__all__ + mcmot_metrics.__all__ +__all__ = metrics.__all__ + mcmot_metrics.__all__ \ No newline at end of file diff --git a/paddlers/models/ppdet/metrics/coco_utils.py b/paddlers/models/ppdet/metrics/coco_utils.py index c920fd4..b6a1cff 100644 --- a/paddlers/models/ppdet/metrics/coco_utils.py +++ b/paddlers/models/ppdet/metrics/coco_utils.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/metrics/json_results.py b/paddlers/models/ppdet/metrics/json_results.py old mode 100644 new mode 100755 index aab0fbe..b20c30e --- a/paddlers/models/ppdet/metrics/json_results.py +++ b/paddlers/models/ppdet/metrics/json_results.py @@ -65,6 +65,14 @@ def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): return det_res +def strip_mask(mask): + row = mask[0, 0, :] + col = mask[0, :, 0] + im_h = len(col) - np.count_nonzero(col == -1) + im_w = len(row) - np.count_nonzero(row == -1) + return mask[:, :im_h, :im_w] + + def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): import pycocotools.mask as mask_util seg_res = [] @@ -72,8 +80,10 @@ def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): for i in range(len(mask_nums)): cur_image_id = int(image_id[i][0]) det_nums = mask_nums[i] + mask_i = masks[k:k + det_nums] + mask_i = strip_mask(mask_i) for j in range(det_nums): - mask = masks[k].astype(np.uint8) + mask = mask_i[j].astype(np.uint8) score = float(bboxes[k][1]) label = int(bboxes[k][0]) k = k + 1 diff --git a/paddlers/models/ppdet/metrics/keypoint_metrics.py b/paddlers/models/ppdet/metrics/keypoint_metrics.py index 54eeda2..6e10a0c 100644 --- a/paddlers/models/ppdet/metrics/keypoint_metrics.py +++ b/paddlers/models/ppdet/metrics/keypoint_metrics.py @@ -1,21 +1,22 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import os import json from collections import defaultdict, OrderedDict import numpy as np +import paddle from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from ..modeling.keypoint_utils import oks_nms @@ -70,15 +71,23 @@ class KeyPointTopDownCOCOEval(object): self.results['all_preds'][self.idx:self.idx + num_images, :, 0: 3] = kpts[:, :, 0:3] self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[ - 'center'].numpy()[:, 0:2] + 'center'].numpy()[:, 0:2] if isinstance( + inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2] self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[ - 'scale'].numpy()[:, 0:2] + 'scale'].numpy()[:, 0:2] if isinstance( + inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2] self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod( - inputs['scale'].numpy() * 200, 1) - self.results['all_boxes'][self.idx:self.idx + num_images, - 5] = np.squeeze(inputs['score'].numpy()) - self.results['image_path'].extend(inputs['im_id'].numpy()) - + inputs['scale'].numpy() * 200, + 1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod( + inputs['scale'] * 200, 1) + self.results['all_boxes'][ + self.idx:self.idx + num_images, + 5] = np.squeeze(inputs['score'].numpy()) if isinstance( + inputs['score'], paddle.Tensor) else np.squeeze(inputs['score']) + if isinstance(inputs['im_id'], paddle.Tensor): + self.results['image_path'].extend(inputs['im_id'].numpy()) + else: + self.results['image_path'].extend(inputs['im_id']) self.idx += num_images def _write_coco_keypoint_results(self, keypoints): diff --git a/paddlers/models/ppdet/metrics/map_utils.py b/paddlers/models/ppdet/metrics/map_utils.py index a7d786e..32c7020 100644 --- a/paddlers/models/ppdet/metrics/map_utils.py +++ b/paddlers/models/ppdet/metrics/map_utils.py @@ -22,7 +22,7 @@ import sys import numpy as np import itertools import paddle -from paddlers.models.ppdet.modeling.bbox_utils import poly2rbox, rbox2poly_np +from paddlers.models.ppdet.modeling.rbox_utils import poly2rbox_np from paddlers.models.ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @@ -91,15 +91,13 @@ def jaccard_overlap(pred, gt, is_bbox_normalized=False): return overlap -def calc_rbox_iou(pred, gt_rbox): +def calc_rbox_iou(pred, gt_poly): """ calc iou between rotated bbox """ # calc iou of bounding box for speedup - pred = 
np.array(pred, np.float32).reshape(-1, 8) - pred = pred.reshape(-1, 2) - gt_poly = rbox2poly_np(np.array(gt_rbox).reshape(-1, 5))[0] - gt_poly = gt_poly.reshape(-1, 2) + pred = np.array(pred, np.float32).reshape(-1, 2) + gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2) pred_rect = [ np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]), np.max(pred[:, 1]) @@ -114,20 +112,15 @@ def calc_rbox_iou(pred, gt_rbox): return iou # calc rbox iou - pred = pred.reshape(-1, 8) - - pred = np.array(pred, np.float32).reshape(-1, 8) - pred_rbox = poly2rbox(pred) - pred_rbox = pred_rbox.reshape(-1, 5) - pred_rbox = pred_rbox.reshape(-1, 5) + pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5) + gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5) try: - from rbox_iou_ops import rbox_iou + from ext_op import rbox_iou except Exception as e: - print("import custom_ops error, try install rbox_iou_ops " \ + print("import custom_ops error, try install ext_op " \ "following ppdet/ext_op/README.md", e) sys.stdout.flush() sys.exit(-1) - gt_rbox = np.array(gt_rbox, np.float32).reshape(-1, 5) pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32') pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32') iou = rbox_iou(pd_gt_rbox, pd_pred_rbox) @@ -138,8 +131,7 @@ def calc_rbox_iou(pred, gt_rbox): def prune_zero_padding(gt_box, gt_label, difficult=None): valid_cnt = 0 for i in range(len(gt_box)): - if gt_box[i, 0] == 0 and gt_box[i, 1] == 0 and \ - gt_box[i, 2] == 0 and gt_box[i, 3] == 0: + if (gt_box[i] == 0).all(): break valid_cnt += 1 return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt] @@ -154,8 +146,8 @@ class DetectionMAP(object): Args: class_num (int): The class number. overlap_thresh (float): The threshold of overlap - ratio between prediction bounding box and - ground truth bounding box for deciding + ratio between prediction bounding box and + ground truth bounding box for deciding true/false positive. Default 0.5. map_type (str): Calculation method of mean average precision, currently support '11point' and @@ -212,7 +204,7 @@ class DetectionMAP(object): max_overlap = -1.0 for i, gl in enumerate(gt_label): if int(gl) == int(l): - if len(gt_box[i]) == 5: + if len(gt_box[i]) == 8: overlap = calc_rbox_iou(pred, gt_box[i]) else: overlap = jaccard_overlap(pred, gt_box[i], @@ -363,7 +355,7 @@ def ap_per_class(tp, conf, pred_cls, target_cls): """ Computes the average precision, given the recall and precision curves. Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. - + Args: tp (list): True positives. conf (list): Objectness value from 0-1. @@ -417,7 +409,7 @@ def compute_ap(recall, precision): """ Computes the average precision, given the recall and precision curves. Code originally from https://github.com/rbgirshick/py-faster-rcnn. - + Args: recall (list): The recall curve. precision (list): The precision curve. diff --git a/paddlers/models/ppdet/metrics/mcmot_metrics.py b/paddlers/models/ppdet/metrics/mcmot_metrics.py index 75bbca5..50c6aa3 100644 --- a/paddlers/models/ppdet/metrics/mcmot_metrics.py +++ b/paddlers/models/ppdet/metrics/mcmot_metrics.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import @@ -21,18 +21,21 @@ import copy import sys import math from collections import defaultdict -from motmetrics.math_util import quiet_divide import numpy as np import pandas as pd -import paddle -import paddle.nn.functional as F from .metrics import Metric -import motmetrics as mm -import openpyxl -metrics = mm.metrics.motchallenge_metrics -mh = mm.metrics.create() +try: + import motmetrics as mm + from motmetrics.math_util import quiet_divide + metrics = mm.metrics.motchallenge_metrics + mh = mm.metrics.create() +except: + print( + 'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' + ) + pass from paddlers.models.ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @@ -78,7 +81,7 @@ NAME_MAP = { def parse_accs_metrics(seq_acc, index_name, verbose=False): """ - Parse the evaluation indicators of multiple MOTAccumulator + Parse the evaluation indicators of multiple MOTAccumulator """ mh = mm.metrics.create() summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST) @@ -302,24 +305,30 @@ class MCMOTEvaluator(object): self.num_classes = num_classes self.load_annotations() + try: + import motmetrics as mm + mm.lap.default_solver = 'lap' + except Exception as e: + raise RuntimeError( + 'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' + ) self.reset_accumulator() self.class_accs = [] def load_annotations(self): assert self.data_type == 'mcmot' - self.gt_filename = os.path.join(self.data_root, '../', '../', - 'sequences', + self.gt_filename = os.path.join(self.data_root, '../', 'sequences', '{}.txt'.format(self.seq_name)) + if not os.path.exists(self.gt_filename): + logger.warning( + "gt_filename '{}' of MCMOTEvaluator is not exist, so the MOTA will be -INF." 
+ ) def reset_accumulator(self): - import motmetrics as mm - mm.lap.default_solver = 'lap' self.acc = mm.MOTAccumulator(auto_id=True) def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False): - import motmetrics as mm - mm.lap.default_solver = 'lap' if union: trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3] gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3] @@ -393,9 +402,6 @@ class MCMOTEvaluator(object): names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): - import motmetrics as mm - mm.lap.default_solver = 'lap' - names = copy.deepcopy(names) if metrics is None: metrics = mm.metrics.motchallenge_metrics diff --git a/paddlers/models/ppdet/metrics/metrics.py b/paddlers/models/ppdet/metrics/metrics.py index a23e5cc..577bf6b 100644 --- a/paddlers/models/ppdet/metrics/metrics.py +++ b/paddlers/models/ppdet/metrics/metrics.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import @@ -22,11 +22,14 @@ import json import paddle import numpy as np import typing +from collections import defaultdict +from pathlib import Path from .map_utils import prune_zero_padding, DetectionMAP from .coco_utils import get_infer_results, cocoapi_eval from .widerface_utils import face_eval_run from paddlers.models.ppdet.data.source.category import get_categories +from paddlers.models.ppdet.modeling.rbox_utils import poly2rbox_np from paddlers.models.ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @@ -69,8 +72,6 @@ class Metric(paddle.metric.Metric): class COCOMetric(Metric): def __init__(self, anno_file, **kwargs): - assert os.path.isfile(anno_file), \ - "anno_file {} not a file".format(anno_file) self.anno_file = anno_file self.clsid2catid = kwargs.get('clsid2catid', None) if self.clsid2catid is None: @@ -81,6 +82,14 @@ class COCOMetric(Metric): self.bias = kwargs.get('bias', 0) self.save_prediction_only = kwargs.get('save_prediction_only', False) self.iou_type = kwargs.get('IouType', 'bbox') + + if not self.save_prediction_only: + assert os.path.isfile(anno_file), \ + "anno_file {} not a file".format(anno_file) + + if self.output_eval is not None: + Path(self.output_eval).mkdir(exist_ok=True) + self.reset() def reset(self): @@ -218,7 +227,9 @@ class VOCMetric(Metric): map_type='11point', is_bbox_normalized=False, evaluate_difficult=False, - classwise=False): + classwise=False, + output_eval=None, + save_prediction_only=False): assert os.path.isfile(label_list), \ "label_list {} not a file".format(label_list) self.clsid2catid, self.catid2name = get_categories('VOC', label_list) @@ -226,6 +237,8 @@ class VOCMetric(Metric): self.overlap_thresh = overlap_thresh self.map_type = map_type self.evaluate_difficult = evaluate_difficult + self.output_eval = output_eval + self.save_prediction_only = save_prediction_only self.detection_map = DetectionMAP( class_num=class_num, overlap_thresh=overlap_thresh, @@ -238,34 +251,52 @@ class VOCMetric(Metric): self.reset() def reset(self): + self.results = {'bbox': [], 'score': [], 'label': []} self.detection_map.reset() def update(self, inputs, outputs): - bbox_np = outputs['bbox'].numpy() + bbox_np = outputs['bbox'].numpy() if isinstance( + outputs['bbox'], paddle.Tensor) else outputs['bbox'] bboxes = bbox_np[:, 2:] scores = bbox_np[:, 1] labels = bbox_np[:, 0] - bbox_lengths = outputs['bbox_num'].numpy() + bbox_lengths = outputs['bbox_num'].numpy() if isinstance( + outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num'] + + self.results['bbox'].append(bboxes.tolist()) + self.results['score'].append(scores.tolist()) + self.results['label'].append(labels.tolist()) if bboxes.shape == (1, 1) or bboxes is None: return + if self.save_prediction_only: + return + gt_boxes = inputs['gt_bbox'] gt_labels = inputs['gt_class'] difficults = inputs['difficult'] if not self.evaluate_difficult \ else None - scale_factor = inputs['scale_factor'].numpy( - ) if 'scale_factor' in inputs else np.ones( - (gt_boxes.shape[0], 2)).astype('float32') + if 'scale_factor' in inputs: + scale_factor = inputs['scale_factor'].numpy() if isinstance( + inputs['scale_factor'], + paddle.Tensor) else inputs['scale_factor'] + else: + scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') bbox_idx = 0 for i in range(len(gt_boxes)): - gt_box = gt_boxes[i].numpy() + gt_box = gt_boxes[i].numpy() if isinstance( + gt_boxes[i], paddle.Tensor) else gt_boxes[i] h, w = scale_factor[i] gt_box = gt_box / np.array([w, h, w, h]) - 
gt_label = gt_labels[i].numpy() - difficult = None if difficults is None \ - else difficults[i].numpy() + gt_label = gt_labels[i].numpy() if isinstance( + gt_labels[i], paddle.Tensor) else gt_labels[i] + if difficults is not None: + difficult = difficults[i].numpy() if isinstance( + difficults[i], paddle.Tensor) else difficults[i] + else: + difficult = None bbox_num = bbox_lengths[i] bbox = bboxes[bbox_idx:bbox_idx + bbox_num] score = scores[bbox_idx:bbox_idx + bbox_num] @@ -277,6 +308,15 @@ class VOCMetric(Metric): bbox_idx += bbox_num def accumulate(self): + output = "bbox.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results, f) + logger.info('The bbox result is saved to bbox.json.') + if self.save_prediction_only: + return + logger.info("Accumulating evaluatation results...") self.detection_map.accumulate() @@ -309,25 +349,16 @@ class WiderFaceMetric(Metric): class RBoxMetric(Metric): def __init__(self, anno_file, **kwargs): - assert os.path.isfile(anno_file), \ - "anno_file {} not a file".format(anno_file) - assert os.path.exists(anno_file), "anno_file {} not exists".format( - anno_file) self.anno_file = anno_file - self.gt_anno = json.load(open(self.anno_file)) - cats = self.gt_anno['categories'] - self.clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)} - self.catid2clsid = {cat['id']: i for i, cat in enumerate(cats)} - self.catid2name = {cat['id']: cat['name'] for cat in cats} + self.clsid2catid, self.catid2name = get_categories('COCO', anno_file) + self.catid2clsid = {v: k for k, v in self.clsid2catid.items()} self.classwise = kwargs.get('classwise', False) self.output_eval = kwargs.get('output_eval', None) - # TODO: bias should be unified - self.bias = kwargs.get('bias', 0) self.save_prediction_only = kwargs.get('save_prediction_only', False) - self.iou_type = kwargs.get('IouType', 'bbox') self.overlap_thresh = kwargs.get('overlap_thresh', 0.5) self.map_type = kwargs.get('map_type', '11point') self.evaluate_difficult = kwargs.get('evaluate_difficult', False) + self.imid2path = kwargs.get('imid2path', None) class_num = len(self.catid2name) self.detection_map = DetectionMAP( class_num=class_num, @@ -341,7 +372,7 @@ class RBoxMetric(Metric): self.reset() def reset(self): - self.result_bbox = [] + self.results = [] self.detection_map.reset() def update(self, inputs, outputs): @@ -351,43 +382,83 @@ class RBoxMetric(Metric): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v im_id = inputs['im_id'] - outs['im_id'] = im_id.numpy() if isinstance(im_id, - paddle.Tensor) else im_id + im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id + outs['im_id'] = im_id - infer_results = get_infer_results( - outs, self.clsid2catid, bias=self.bias) - self.result_bbox += infer_results[ - 'bbox'] if 'bbox' in infer_results else [] - bbox = [b['bbox'] for b in self.result_bbox] - score = [b['score'] for b in self.result_bbox] - label = [b['category_id'] for b in self.result_bbox] - label = [self.catid2clsid[e] for e in label] - gt_box = [ - e['bbox'] for e in self.gt_anno['annotations'] - if e['image_id'] == outs['im_id'] - ] - gt_label = [ - e['category_id'] for e in self.gt_anno['annotations'] - if e['image_id'] == outs['im_id'] - ] - gt_label = [self.catid2clsid[e] for e in gt_label] - self.detection_map.update(bbox, score, label, gt_box, gt_label) + infer_results = get_infer_results(outs, self.clsid2catid) + infer_results = infer_results['bbox'] if 'bbox' in infer_results else [] + 
self.results += infer_results + if self.save_prediction_only: + return - def accumulate(self): - if len(self.result_bbox) > 0: - output = "bbox.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) + gt_boxes = inputs['gt_poly'] + gt_labels = inputs['gt_class'] + + if 'scale_factor' in inputs: + scale_factor = inputs['scale_factor'].numpy() if isinstance( + inputs['scale_factor'], + paddle.Tensor) else inputs['scale_factor'] + else: + scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') + + for i in range(len(gt_boxes)): + gt_box = gt_boxes[i].numpy() if isinstance( + gt_boxes[i], paddle.Tensor) else gt_boxes[i] + h, w = scale_factor[i] + gt_box = gt_box / np.array([w, h, w, h, w, h, w, h]) + gt_label = gt_labels[i].numpy() if isinstance( + gt_labels[i], paddle.Tensor) else gt_labels[i] + gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label) + bbox = [ + res['bbox'] for res in infer_results + if int(res['image_id']) == int(im_id[i]) + ] + score = [ + res['score'] for res in infer_results + if int(res['image_id']) == int(im_id[i]) + ] + label = [ + self.catid2clsid[int(res['category_id'])] + for res in infer_results + if int(res['image_id']) == int(im_id[i]) + ] + self.detection_map.update(bbox, score, label, gt_box, gt_label) + + def save_results(self, results, output_dir, imid2path): + if imid2path: + data_dicts = defaultdict(list) + for result in results: + image_id = result['image_id'] + data_dicts[image_id].append(result) + + for image_id, image_path in imid2path.items(): + basename = os.path.splitext(os.path.split(image_path)[-1])[0] + output = os.path.join(output_dir, "{}.txt".format(basename)) + dets = data_dicts.get(image_id, []) + with open(output, 'w') as f: + for det in dets: + catid, bbox, score = det['category_id'], det[ + 'bbox'], det['score'] + bbox_pred = '{} {} '.format(self.catid2name[catid], + score) + ' '.join( + [str(e) for e in bbox]) + f.write(bbox_pred + '\n') + + logger.info('The bbox result is saved to {}.'.format(output_dir)) + else: + output = os.path.join(output_dir, "bbox.json") with open(output, 'w') as f: - json.dump(self.result_bbox, f) - logger.info('The bbox result is saved to bbox.json.') + json.dump(results, f) - if self.save_prediction_only: - logger.info('The bbox result is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - logger.info("Accumulating evaluatation results...") - self.detection_map.accumulate() + logger.info('The bbox result is saved to {}.'.format(output)) + + def accumulate(self): + if self.output_eval: + self.save_results(self.results, self.output_eval, self.imid2path) + + if not self.save_prediction_only: + logger.info("Accumulating evaluatation results...") + self.detection_map.accumulate() def log(self): map_stat = 100. * self.detection_map.get_map() diff --git a/paddlers/models/ppdet/metrics/mot_metrics.py b/paddlers/models/ppdet/metrics/mot_metrics.py index 1935840..e369edf 100644 --- a/paddlers/models/ppdet/metrics/mot_metrics.py +++ b/paddlers/models/ppdet/metrics/mot_metrics.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import @@ -22,13 +22,21 @@ import sys import math from collections import defaultdict import numpy as np -import paddle -import paddle.nn.functional as F + from paddlers.models.ppdet.modeling.bbox_utils import bbox_iou_np_expand from .map_utils import ap_per_class from .metrics import Metric from .munkres import Munkres +try: + import motmetrics as mm + mm.lap.default_solver = 'lap' +except: + print( + 'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' + ) + pass + from paddlers.models.ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @@ -36,8 +44,13 @@ __all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric'] def read_mot_results(filename, is_gt=False, is_ignore=False): - valid_labels = {1} - ignore_labels = {2, 7, 8, 12} # only in motchallenge datasets like 'MOT16' + valid_label = [1] + ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16' + if is_gt: + logger.info( + "In MOT16/17 dataset the valid_label of ground truth is '{}', " + "in other dataset it should be '0' for single classs MOT.".format( + valid_label[0])) results_dict = dict() if os.path.isfile(filename): with open(filename, 'r') as f: @@ -50,12 +63,10 @@ def read_mot_results(filename, is_gt=False, is_ignore=False): continue results_dict.setdefault(fid, list()) - box_size = float(linelist[4]) * float(linelist[5]) - if is_gt: label = int(float(linelist[7])) mark = int(float(linelist[6])) - if mark == 0 or label not in valid_labels: + if mark == 0 or label not in valid_label: continue score = 1 elif is_ignore: @@ -112,24 +123,31 @@ class MOTEvaluator(object): self.data_type = data_type self.load_annotations() + try: + import motmetrics as mm + mm.lap.default_solver = 'lap' + except Exception as e: + raise RuntimeError( + 'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' + ) self.reset_accumulator() def load_annotations(self): assert self.data_type == 'mot' gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') + if not os.path.exists(gt_filename): + logger.warning( + "gt_filename '{}' of MOTEvaluator is not exist, so the MOTA will be -INF." 
+ ) self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True) self.gt_ignore_frame_dict = read_mot_results( gt_filename, is_ignore=True) def reset_accumulator(self): - import motmetrics as mm - mm.lap.default_solver = 'lap' self.acc = mm.MOTAccumulator(auto_id=True) def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): - import motmetrics as mm - mm.lap.default_solver = 'lap' # results trk_tlwhs = np.copy(trk_tlwhs) trk_ids = np.copy(trk_ids) @@ -187,8 +205,6 @@ class MOTEvaluator(object): names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): - import motmetrics as mm - mm.lap.default_solver = 'lap' names = copy.deepcopy(names) if metrics is None: metrics = mm.metrics.motchallenge_metrics @@ -225,8 +241,6 @@ class MOTMetric(Metric): self.result_root = result_root def accumulate(self): - import motmetrics as mm - import openpyxl metrics = mm.metrics.motchallenge_metrics mh = mm.metrics.create() summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics) @@ -422,7 +436,7 @@ class KITTIEvaluation(object): self.ifn = 0 # number of ignored false negatives self.ifns = [] # number of ignored false negatives PER SEQUENCE self.fp = 0 # number of false positives - # a bit tricky, the number of ignored false negatives and ignored true positives + # a bit tricky, the number of ignored false negatives and ignored true positives # is subtracted, but if both tracker detection and ground truth detection # are ignored this number is added again to avoid double counting self.fps = [] # above PER SEQUENCE @@ -551,7 +565,7 @@ class KITTIEvaluation(object): "track ids are not unique for sequence %d: frame %d" % (seq, t_data.frame)) logger.info( - "track id %d occured at least twice for this frame" + "track id %d occurred at least twice for this frame" % t_data.track_id) logger.info("Exiting...") #continue # this allows to evaluate non-unique result files diff --git a/paddlers/models/ppdet/metrics/munkres.py b/paddlers/models/ppdet/metrics/munkres.py index 58c95d6..307028b 100644 --- a/paddlers/models/ppdet/metrics/munkres.py +++ b/paddlers/models/ppdet/metrics/munkres.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
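Both MOT metric modules now import `motmetrics` lazily and fail with an actionable message instead of breaking at import time. For readers unfamiliar with the library, here is a minimal sketch (with made-up data) of the accumulator API that `MOTEvaluator.eval_frame` drives:

```python
# Minimal motmetrics sketch with made-up boxes; illustrative only.
import numpy as np

try:
    import motmetrics as mm
    mm.lap.default_solver = 'lap'  # same solver choice as the code above
except ImportError as e:
    raise RuntimeError('please run `pip install motmetrics`') from e

acc = mm.MOTAccumulator(auto_id=True)
# One frame: two GT objects vs. two tracked hypotheses (tlwh format).
dists = mm.distances.iou_matrix(
    np.array([[0., 0., 20., 20.], [40., 40., 20., 20.]]),  # ground truth
    np.array([[1., 1., 20., 20.], [60., 60., 20., 20.]]),  # tracker output
    max_iou=0.5)
acc.update([1, 2], [1, 2], dists)  # gt ids, hypothesis ids, distance matrix

mh = mm.metrics.create()
print(mh.compute(acc, metrics=['mota', 'idf1'], name='demo'))
```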
""" This code is borrow from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py diff --git a/paddlers/models/ppdet/model_zoo/.gitignore b/paddlers/models/ppdet/model_zoo/.gitignore new file mode 100644 index 0000000..f296851 --- /dev/null +++ b/paddlers/models/ppdet/model_zoo/.gitignore @@ -0,0 +1 @@ +MODEL_ZOO diff --git a/paddlers/models/ppdet/model_zoo/__init__.py b/paddlers/models/ppdet/model_zoo/__init__.py index caffdb5..8c3b37d 100644 --- a/paddlers/models/ppdet/model_zoo/__init__.py +++ b/paddlers/models/ppdet/model_zoo/__init__.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from . import model_zoo diff --git a/paddlers/models/ppdet/model_zoo/model_zoo.py b/paddlers/models/ppdet/model_zoo/model_zoo.py index 2d0dbef..bbc9e9f 100644 --- a/paddlers/models/ppdet/model_zoo/model_zoo.py +++ b/paddlers/models/ppdet/model_zoo/model_zoo.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
import os.path as osp diff --git a/paddlers/models/ppdet/model_zoo/tests/__init__.py b/paddlers/models/ppdet/model_zoo/tests/__init__.py new file mode 100644 index 0000000..5135585 --- /dev/null +++ b/paddlers/models/ppdet/model_zoo/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlers/models/ppdet/model_zoo/tests/test_get_model.py b/paddlers/models/ppdet/model_zoo/tests/test_get_model.py new file mode 100644 index 0000000..ee47df5 --- /dev/null +++ b/paddlers/models/ppdet/model_zoo/tests/test_get_model.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import paddle +import paddlers.models.ppdet as ppdet +import unittest + +# NOTE: weights downloading costs time, we choose +# a small model for unittesting +MODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco' + + +class TestGetConfigFile(unittest.TestCase): + def test_main(self): + try: + cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME) + assert os.path.isfile(cfg_file) + except: + self.assertTrue(False) + + +class TestGetModel(unittest.TestCase): + def test_main(self): + try: + model = ppdet.model_zoo.get_model(MODEL_NAME) + assert isinstance(model, paddle.nn.Layer) + except: + self.assertTrue(False) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddlers/models/ppdet/model_zoo/tests/test_list_model.py b/paddlers/models/ppdet/model_zoo/tests/test_list_model.py new file mode 100644 index 0000000..3dca71f --- /dev/null +++ b/paddlers/models/ppdet/model_zoo/tests/test_list_model.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
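The new model-zoo tests in this patch document the public API surface. Condensed into a usage sketch (the model name is the one the tests use; `get_model` downloads weights on first call, so it needs network access):

```python
# Sketch of the ppdet.model_zoo API exercised by the tests in this patch.
import paddle
import paddlers.models.ppdet as ppdet

# Prints matching model names; raises ValueError when no model matches.
ppdet.model_zoo.list_model(['yolo'])
cfg_file = ppdet.model_zoo.get_config_file('ppyolo/ppyolo_tiny_650e_coco')
model = ppdet.model_zoo.get_model('ppyolo/ppyolo_tiny_650e_coco')
assert isinstance(model, paddle.nn.Layer)
```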
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import paddlers.models.ppdet as ppdet + + +class TestListModel(unittest.TestCase): + def setUp(self): + self._filter = [] + + def test_main(self): + try: + ppdet.model_zoo.list_model(self._filter) + self.assertTrue(True) + except: + self.assertTrue(False) + + +class TestListModelYOLO(TestListModel): + def setUp(self): + self._filter = ['yolo'] + + +class TestListModelRCNN(TestListModel): + def setUp(self): + self._filter = ['rcnn'] + + +class TestListModelSSD(TestListModel): + def setUp(self): + self._filter = ['ssd'] + + +class TestListModelMultiFilter(TestListModel): + def setUp(self): + self._filter = ['yolo', 'darknet'] + + +class TestListModelError(unittest.TestCase): + def setUp(self): + self._filter = ['xxx'] + + def test_main(self): + try: + ppdet.model_zoo.list_model(self._filter) + self.assertTrue(False) + except ValueError: + self.assertTrue(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddlers/models/ppdet/modeling/__init__.py b/paddlers/models/ppdet/modeling/__init__.py index 815d089..823602a 100644 --- a/paddlers/models/ppdet/modeling/__init__.py +++ b/paddlers/models/ppdet/modeling/__init__.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import warnings @@ -29,6 +29,7 @@ from . import reid from . import mot from . import transformers from . import assigners +from . import rbox_utils from .ops import * from .backbones import * @@ -43,3 +44,4 @@ from .reid import * from .mot import * from .transformers import * from .assigners import * +from .rbox_utils import * diff --git a/paddlers/models/ppdet/modeling/architectures/__init__.py b/paddlers/models/ppdet/modeling/architectures/__init__.py index 8d34367..c4528e6 100644 --- a/paddlers/models/ppdet/modeling/architectures/__init__.py +++ b/paddlers/models/ppdet/modeling/architectures/__init__.py @@ -1,10 +1,17 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from . import meta_arch from . import faster_rcnn from . import mask_rcnn @@ -26,6 +33,9 @@ from . import picodet from . import detr from . import sparse_rcnn from . import tood +from . import retinanet +from . import bytetrack +from . import yolox from .meta_arch import * from .faster_rcnn import * @@ -49,3 +59,6 @@ from .picodet import * from .detr import * from .sparse_rcnn import * from .tood import * +from .retinanet import * +from .bytetrack import * +from .yolox import * diff --git a/paddlers/models/ppdet/modeling/architectures/bytetrack.py b/paddlers/models/ppdet/modeling/architectures/bytetrack.py new file mode 100644 index 0000000..435f953 --- /dev/null +++ b/paddlers/models/ppdet/modeling/architectures/bytetrack.py @@ -0,0 +1,79 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddlers.models.ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['ByteTrack'] + + +@register +class ByteTrack(BaseArch): + """ + ByteTrack network, see https://arxiv.org/abs/2110.06864 + + Args: + detector (object): detector model instance + reid (object): reid model instance, default None + tracker (object): tracker instance + """ + __category__ = 'architecture' + + def __init__(self, detector='YOLOX', reid=None, tracker='JDETracker'): + super(ByteTrack, self).__init__() + self.detector = detector + self.reid = reid + self.tracker = tracker + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + detector = create(cfg['detector']) + + if cfg['reid'] != 'None': + reid = create(cfg['reid']) + else: + reid = None + + tracker = create(cfg['tracker']) + + return { + "detector": detector, + "reid": reid, + "tracker": tracker, + } + + def _forward(self): + det_outs = self.detector(self.inputs) + + if self.training: + return det_outs + else: + if self.reid is not None: + assert 'crops' in self.inputs + crops = self.inputs['crops'] + pred_embs = self.reid(crops) + else: + pred_embs = None + det_outs['embeddings'] = pred_embs + return det_outs + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py b/paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py index db1a100..6e8c330 100644 --- a/paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py +++ b/paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
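`ByteTrack` above is a thin composition layer: `from_config` resolves its `detector`, `reid`, and `tracker` sub-components through the registry, and `_forward` simply routes detector outputs (plus optional ReID embeddings) to the tracker. A schematic of how such an architecture is typically instantiated via the workspace helpers (the config path is an assumption; the real YAML lives outside this patch):

```python
# Schematic only: how the workspace registry wires ByteTrack together.
# The config path below is assumed for illustration.
from paddlers.models.ppdet.core.workspace import load_config, create

cfg = load_config('configs/mot/bytetrack/bytetrack_yolox.yml')
model = create(cfg.architecture)  # -> ByteTrack(detector=YOLOX(), ...)
```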
from __future__ import absolute_import @@ -111,14 +111,14 @@ class CascadeRCNN(BaseArch): bbox, bbox_num = self.bbox_post_process( preds, (refined_rois, rois_num), im_shape, scale_factor) # rescale the prediction back to origin image - bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num, - im_shape, scale_factor) + bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( + bbox, bbox_num, im_shape, scale_factor) if not self.with_mask: return bbox_pred, bbox_num, None mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs) origin_shape = self.bbox_post_process.get_origin_shape() - mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred, - bbox_num, origin_shape) + mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, + origin_shape) return bbox_pred, bbox_num, mask_pred def get_loss(self, ): diff --git a/paddlers/models/ppdet/modeling/architectures/centernet.py b/paddlers/models/ppdet/modeling/architectures/centernet.py old mode 100644 new mode 100755 index e534fdd..52916ac --- a/paddlers/models/ppdet/modeling/architectures/centernet.py +++ b/paddlers/models/ppdet/modeling/architectures/centernet.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/architectures/deepsort.py b/paddlers/models/ppdet/modeling/architectures/deepsort.py index 14ed6cd..3caf4fd 100644 --- a/paddlers/models/ppdet/modeling/architectures/deepsort.py +++ b/paddlers/models/ppdet/modeling/architectures/deepsort.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import @@ -62,8 +62,9 @@ class DeepSORT(BaseArch): def _forward(self): crops = self.inputs['crops'] - features = self.reid(crops) - return features + outs = {} + outs['embeddings'] = self.reid(crops) + return outs def get_pred(self): return self._forward() diff --git a/paddlers/models/ppdet/modeling/architectures/fairmot.py b/paddlers/models/ppdet/modeling/architectures/fairmot.py old mode 100644 new mode 100755 index cf1127f..897bc46 --- a/paddlers/models/ppdet/modeling/architectures/fairmot.py +++ b/paddlers/models/ppdet/modeling/architectures/fairmot.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/architectures/faster_rcnn.py b/paddlers/models/ppdet/modeling/architectures/faster_rcnn.py index 23fe0e0..49e5ad5 100644 --- a/paddlers/models/ppdet/modeling/architectures/faster_rcnn.py +++ b/paddlers/models/ppdet/modeling/architectures/faster_rcnn.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import @@ -87,8 +87,8 @@ class FasterRCNN(BaseArch): im_shape, scale_factor) # rescale the prediction back to origin image - bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num, - im_shape, scale_factor) + bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred( + bbox, bbox_num, im_shape, scale_factor) return bbox_pred, bbox_num def get_loss(self, ): diff --git a/paddlers/models/ppdet/modeling/architectures/fcos.py b/paddlers/models/ppdet/modeling/architectures/fcos.py index c851416..5a3447b 100644 --- a/paddlers/models/ppdet/modeling/architectures/fcos.py +++ b/paddlers/models/ppdet/modeling/architectures/fcos.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/architectures/gfl.py b/paddlers/models/ppdet/modeling/architectures/gfl.py index fabd3d6..86471ea 100644 --- a/paddlers/models/ppdet/modeling/architectures/gfl.py +++ b/paddlers/models/ppdet/modeling/architectures/gfl.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/architectures/jde.py b/paddlers/models/ppdet/modeling/architectures/jde.py index 7210eeb..b0fd65c 100644 --- a/paddlers/models/ppdet/modeling/architectures/jde.py +++ b/paddlers/models/ppdet/modeling/architectures/jde.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/architectures/keypoint_hrhrnet.py b/paddlers/models/ppdet/modeling/architectures/keypoint_hrhrnet.py index 7c81727..cc980ee 100644 --- a/paddlers/models/ppdet/modeling/architectures/keypoint_hrhrnet.py +++ b/paddlers/models/ppdet/modeling/architectures/keypoint_hrhrnet.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
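A change that recurs through the RCNN-family forward passes above (cascade_rcnn.py, faster_rcnn.py) is the widened bbox_post_process.get_pred contract: callers now unpack a 3-tuple instead of a single rescaled prediction. A hedged stub of what those call sites appear to assume (stand-in arithmetic, not the real BBoxPostProcess):

def get_pred(bbox, bbox_num, im_shape, scale_factor):
    # Stand-in: return the input-scale boxes unchanged, a copy rescaled
    # back to original-image coordinates, and the per-image box count.
    bbox_origin = [v / scale_factor for v in bbox]
    return bbox, bbox_origin, bbox_num


bbox, bbox_pred, bbox_num = get_pred(
    [100.0, 200.0, 300.0, 400.0], 1, (640, 640), 2.0)
print(bbox_pred)  # [50.0, 100.0, 150.0, 200.0] -- original-image scale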
from __future__ import absolute_import @@ -153,7 +153,7 @@ class HrHRNetPostProcess(object): heat_thresh (float): value of topk below this threshhold will be ignored tag_thresh (float): coord's value sampled in tagmap below this threshold belong to same people for init - inputs(list[heatmap]): the output list of modle, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk + inputs(list[heatmap]): the output list of model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk original_height, original_width (float): the original image size ''' diff --git a/paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py b/paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py index 254a15b..6a4751d 100644 --- a/paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py +++ b/paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/architectures/mask_rcnn.py b/paddlers/models/ppdet/modeling/architectures/mask_rcnn.py index 186cce6..f3a949e 100644 --- a/paddlers/models/ppdet/modeling/architectures/mask_rcnn.py +++ b/paddlers/models/ppdet/modeling/architectures/mask_rcnn.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import @@ -112,11 +112,11 @@ class MaskRCNN(BaseArch): body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func) # rescale the prediction back to origin image - bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num, - im_shape, scale_factor) + bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( + bbox, bbox_num, im_shape, scale_factor) origin_shape = self.bbox_post_process.get_origin_shape() - mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred, - bbox_num, origin_shape) + mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, + origin_shape) return bbox_pred, bbox_num, mask_pred def get_loss(self, ): diff --git a/paddlers/models/ppdet/modeling/architectures/meta_arch.py b/paddlers/models/ppdet/modeling/architectures/meta_arch.py index 77db52b..f32b542 100644 --- a/paddlers/models/ppdet/modeling/architectures/meta_arch.py +++ b/paddlers/models/ppdet/modeling/architectures/meta_arch.py @@ -22,22 +22,23 @@ class BaseArch(nn.Layer): self.fuse_norm = False def load_meanstd(self, cfg_transform): - self.scale = 1. - self.mean = paddle.to_tensor([0.485, 0.456, 0.406]).reshape( - (1, 3, 1, 1)) - self.std = paddle.to_tensor([0.229, 0.224, 0.225]).reshape((1, 3, 1, 1)) + scale = 1. + mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) + std = np.array([0.229, 0.224, 0.225], dtype=np.float32) for item in cfg_transform: if 'NormalizeImage' in item: - self.mean = paddle.to_tensor(item['NormalizeImage'][ - 'mean']).reshape((1, 3, 1, 1)) - self.std = paddle.to_tensor(item['NormalizeImage'][ - 'std']).reshape((1, 3, 1, 1)) + mean = np.array( + item['NormalizeImage']['mean'], dtype=np.float32) + std = np.array(item['NormalizeImage']['std'], dtype=np.float32) if item['NormalizeImage'].get('is_scale', True): - self.scale = 1. / 255. + scale = 1. / 255. 
break if self.data_format == 'NHWC': - self.mean = self.mean.reshape(1, 1, 1, 3) - self.std = self.std.reshape(1, 1, 1, 3) + self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3)) + self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3)) + else: + self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1)) + self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1)) def forward(self, inputs): if self.data_format == 'NHWC': @@ -46,7 +47,7 @@ class BaseArch(nn.Layer): if self.fuse_norm: image = inputs['image'] - self.inputs['image'] = (image * self.scale - self.mean) / self.std + self.inputs['image'] = image * self.scale + self.bias self.inputs['im_shape'] = inputs['im_shape'] self.inputs['scale_factor'] = inputs['scale_factor'] else: @@ -63,10 +64,14 @@ class BaseArch(nn.Layer): inputs_list.append(inputs) else: inputs_list.extend(inputs) - outs = [] for inp in inputs_list: - self.inputs = inp + if self.fuse_norm: + self.inputs['image'] = inp['image'] * self.scale + self.bias + self.inputs['im_shape'] = inp['im_shape'] + self.inputs['scale_factor'] = inp['scale_factor'] + else: + self.inputs = inp outs.append(self.get_pred()) # multi-scale test @@ -124,16 +129,3 @@ class BaseArch(nn.Layer): def get_pred(self, ): raise NotImplementedError("Should implement get_pred method!") - - @classmethod - def convert_sync_batchnorm(cls, layer): - layer_output = layer - if getattr(layer, 'norm_type', None) == 'sync_bn': - layer_output = nn.SyncBatchNorm.convert_sync_batchnorm(layer) - else: - for name, sublayer in layer.named_children(): - layer_output.add_sublayer(name, - cls.convert_sync_batchnorm(sublayer)) - - del layer - return layer_output diff --git a/paddlers/models/ppdet/modeling/architectures/picodet.py b/paddlers/models/ppdet/modeling/architectures/picodet.py index baff894..f2a091b 100644 --- a/paddlers/models/ppdet/modeling/architectures/picodet.py +++ b/paddlers/models/ppdet/modeling/architectures/picodet.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
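The load_meanstd rewrite above folds normalization into a single multiply-add: forward() now computes image * self.scale + self.bias instead of (image * scale - mean) / std, using scale' = scale / std and bias = -mean / std. A quick numpy check of that identity:

import numpy as np

x = np.random.rand(1, 3, 8, 8).astype(np.float32)
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape((1, 3, 1, 1))
std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape((1, 3, 1, 1))
scale = 1. / 255.

fused = x * (scale / std) + (-mean / std)  # what forward() computes now
reference = (x * scale - mean) / std       # what it computed before
assert np.allclose(fused, reference, atol=1e-6)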
from __future__ import absolute_import @@ -41,7 +41,8 @@ class PicoDet(BaseArch): self.backbone = backbone self.neck = neck self.head = head - self.deploy = False + self.export_post_process = True + self.export_nms = True @classmethod def from_config(cls, cfg, *args, **kwargs): @@ -62,14 +63,13 @@ class PicoDet(BaseArch): def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats, self.deploy) - if self.training or self.deploy: + head_outs = self.head(fpn_feats, self.export_post_process) + if self.training or not self.export_post_process: return head_outs, None else: - im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] - bboxes, bbox_num = self.head.post_process(head_outs, im_shape, - scale_factor) + bboxes, bbox_num = self.head.post_process( + head_outs, scale_factor, export_nms=self.export_nms) return bboxes, bbox_num def get_loss(self, ): @@ -83,9 +83,13 @@ class PicoDet(BaseArch): return loss def get_pred(self): - if self.deploy: + if not self.export_post_process: return {'picodet': self._forward()[0]} - else: + elif self.export_nms: bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output + else: + bboxes, mlvl_scores = self._forward() + output = {'bbox': bboxes, 'scores': mlvl_scores} + return output diff --git a/paddlers/models/ppdet/modeling/architectures/retinanet.py b/paddlers/models/ppdet/modeling/architectures/retinanet.py new file mode 100644 index 0000000..fcba467 --- /dev/null +++ b/paddlers/models/ppdet/modeling/architectures/retinanet.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
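PicoDet's single deploy flag is split above into export_post_process and export_nms, which yields three inference modes. A compact sketch of the resulting dispatch, mirroring the branches of get_pred above (the forward callable and its outputs are stand-ins for illustration):

def picodet_pred(export_post_process, export_nms, forward):
    if not export_post_process:
        return {'picodet': forward()[0]}                  # raw head outputs
    elif export_nms:
        bbox_pred, bbox_num = forward()
        return {'bbox': bbox_pred, 'bbox_num': bbox_num}  # decoded boxes after NMS
    else:
        bboxes, mlvl_scores = forward()
        return {'bbox': bboxes, 'scores': mlvl_scores}    # NMS deferred to deployment


print(picodet_pred(True, True, lambda: ([[0.0, 0.0, 4.0, 4.0, 0.9]], 1)))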
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddlers.models.ppdet.core.workspace import register, create +from .meta_arch import BaseArch +import paddle + +__all__ = ['RetinaNet'] + + +@register +class RetinaNet(BaseArch): + __category__ = 'architecture' + + def __init__(self, backbone, neck, head): + super(RetinaNet, self).__init__() + self.backbone = backbone + self.neck = neck + self.head = head + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + kwargs = {'input_shape': neck.out_shape} + head = create(cfg['head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + 'head': head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats) + + if self.training: + return self.head(neck_feats, self.inputs) + else: + head_outs = self.head(neck_feats) + bbox, bbox_num = self.head.post_process( + head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/paddlers/models/ppdet/modeling/architectures/s2anet.py b/paddlers/models/ppdet/modeling/architectures/s2anet.py index 5c412a3..0703f22 100644 --- a/paddlers/models/ppdet/modeling/architectures/s2anet.py +++ b/paddlers/models/ppdet/modeling/architectures/s2anet.py @@ -1,15 +1,15 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
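The new RetinaNet above follows the from_config chaining used throughout these architectures: each component exposes out_shape, which is passed to the next component as input_shape. A minimal illustration with stand-in classes (not the real ppdet backbone/neck/head):

class StubBackbone:
    out_shape = [512, 1024, 2048]  # per-level channel specs


class StubNeck:
    def __init__(self, input_shape):
        # e.g. an FPN maps every input level to a fixed channel width
        self.out_shape = [256] * len(input_shape)


class StubHead:
    def __init__(self, input_shape):
        self.in_channels = input_shape


backbone = StubBackbone()
neck = StubNeck(input_shape=backbone.out_shape)
head = StubHead(input_shape=neck.out_shape)
print(head.in_channels)  # [256, 256, 256]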
from __future__ import absolute_import @@ -26,26 +26,21 @@ __all__ = ['S2ANet'] @register class S2ANet(BaseArch): __category__ = 'architecture' - __inject__ = [ - 's2anet_head', - 's2anet_bbox_post_process', - ] + __inject__ = ['head'] - def __init__(self, backbone, neck, s2anet_head, s2anet_bbox_post_process): + def __init__(self, backbone, neck, head): """ S2ANet, see https://arxiv.org/pdf/2008.09397.pdf Args: backbone (object): backbone instance neck (object): `FPN` instance - s2anet_head (object): `S2ANetHead` instance - s2anet_bbox_post_process (object): `S2ANetBBoxPostProcess` instance + head (object): `Head` instance """ super(S2ANet, self).__init__() self.backbone = backbone self.neck = neck - self.s2anet_head = s2anet_head - self.s2anet_bbox_post_process = s2anet_bbox_post_process + self.s2anet_head = head @classmethod def from_config(cls, cfg, *args, **kwargs): @@ -55,42 +50,28 @@ class S2ANet(BaseArch): out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} - s2anet_head = create(cfg['s2anet_head'], **kwargs) - s2anet_bbox_post_process = create(cfg['s2anet_bbox_post_process'], - **kwargs) + head = create(cfg['head'], **kwargs) - return { - 'backbone': backbone, - 'neck': neck, - "s2anet_head": s2anet_head, - "s2anet_bbox_post_process": s2anet_bbox_post_process, - } + return {'backbone': backbone, 'neck': neck, "head": head} def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) - self.s2anet_head(body_feats) if self.training: - loss = self.s2anet_head.get_loss(self.inputs) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) + loss = self.s2anet_head(body_feats, self.inputs) return loss else: - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - nms_pre = self.s2anet_bbox_post_process.nms_pre - pred_scores, pred_bboxes = self.s2anet_head.get_prediction(nms_pre) - + head_outs = self.s2anet_head(body_feats) # post_process - pred_bboxes, bbox_num = self.s2anet_bbox_post_process(pred_scores, - pred_bboxes) + bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs) # rescale the prediction back to origin image - pred_bboxes = self.s2anet_bbox_post_process.get_pred( - pred_bboxes, bbox_num, im_shape, scale_factor) - + im_shape = self.inputs['im_shape'] + scale_factor = self.inputs['scale_factor'] + bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape, + scale_factor) # output - output = {'bbox': pred_bboxes, 'bbox_num': bbox_num} + output = {'bbox': bboxes, 'bbox_num': bbox_num} return output def get_loss(self, ): diff --git a/paddlers/models/ppdet/modeling/architectures/ttfnet.py b/paddlers/models/ppdet/modeling/architectures/ttfnet.py index ec0916e..7441606 100644 --- a/paddlers/models/ppdet/modeling/architectures/ttfnet.py +++ b/paddlers/models/ppdet/modeling/architectures/ttfnet.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/architectures/yolo.py b/paddlers/models/ppdet/modeling/architectures/yolo.py index 83e2bed..51af34c 100644 --- a/paddlers/models/ppdet/modeling/architectures/yolo.py +++ b/paddlers/models/ppdet/modeling/architectures/yolo.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import @@ -109,10 +109,13 @@ class YOLOv3(BaseArch): if self.return_idx: _, bbox, bbox_num, _ = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors) - else: + elif self.post_process is not None: bbox, bbox_num = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors, self.inputs['im_shape'], self.inputs['scale_factor']) + else: + bbox, bbox_num = self.yolo_head.post_process( + yolo_head_outs, self.inputs['scale_factor']) output = {'bbox': bbox, 'bbox_num': bbox_num} return output diff --git a/paddlers/models/ppdet/modeling/architectures/yolox.py b/paddlers/models/ppdet/modeling/architectures/yolox.py new file mode 100644 index 0000000..3c8db24 --- /dev/null +++ b/paddlers/models/ppdet/modeling/architectures/yolox.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddlers.models.ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +import random +import paddle +import paddle.nn.functional as F +import paddle.distributed as dist + +__all__ = ['YOLOX'] + + +@register +class YOLOX(BaseArch): + """ + YOLOX network, see https://arxiv.org/abs/2107.08430 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + head (nn.Layer): head instance + for_mot (bool): whether used for MOT or not + input_size (list[int]): initial scale, will be reset by self._preprocess() + size_stride (int): stride of the size range + size_range (list[int]): multi-scale range for training + random_interval (int): interval of iter to change self._input_size + """ + __category__ = 'architecture' + + def __init__(self, + backbone='CSPDarkNet', + neck='YOLOCSPPAN', + head='YOLOXHead', + for_mot=False, + input_size=[640, 640], + size_stride=32, + size_range=[15, 25], + random_interval=10): + super(YOLOX, self).__init__() + self.backbone = backbone + self.neck = neck + self.head = head + self.for_mot = for_mot + + self.input_size = input_size + self._input_size = paddle.to_tensor(input_size) + self.size_stride = size_stride + self.size_range = size_range + self.random_interval = random_interval + self._step = 0 + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + head = create(cfg['head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "head": head, + } + + def _forward(self): + if self.training: + self._preprocess() + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolox_losses = self.head(neck_feats, self.inputs) + yolox_losses.update({'size': self._input_size[0]}) + return yolox_losses + else: + head_outs = self.head(neck_feats) + bbox, bbox_num = self.head.post_process( + head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() + + def _preprocess(self): + # YOLOX multi-scale training, interpolate resize before inputs of the network. 
+ self._get_size() + scale_y = self._input_size[0] / self.input_size[0] + scale_x = self._input_size[1] / self.input_size[1] + if scale_x != 1 or scale_y != 1: + self.inputs['image'] = F.interpolate( + self.inputs['image'], + size=self._input_size, + mode='bilinear', + align_corners=False) + gt_bboxes = self.inputs['gt_bbox'] + for i in range(len(gt_bboxes)): + if len(gt_bboxes[i]) > 0: + gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x + gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y + self.inputs['gt_bbox'] = gt_bboxes + + def _get_size(self): + # random_interval = 10 as default, every 10 iters to change self._input_size + image_ratio = self.input_size[1] * 1.0 / self.input_size[0] + if self._step % self.random_interval == 0: + size_factor = random.randint(*self.size_range) + size = [ + self.size_stride * size_factor, + self.size_stride * int(size_factor * image_ratio) + ] + self._input_size = paddle.to_tensor(size) + self._step += 1 diff --git a/paddlers/models/ppdet/modeling/assigners/__init__.py b/paddlers/models/ppdet/modeling/assigners/__init__.py index b9b351e..fa51ef1 100644 --- a/paddlers/models/ppdet/modeling/assigners/__init__.py +++ b/paddlers/models/ppdet/modeling/assigners/__init__.py @@ -16,8 +16,10 @@ from . import utils from . import task_aligned_assigner from . import atss_assigner from . import simota_assigner +from . import max_iou_assigner from .utils import * from .task_aligned_assigner import * from .atss_assigner import * from .simota_assigner import * +from .max_iou_assigner import * diff --git a/paddlers/models/ppdet/modeling/assigners/atss_assigner.py b/paddlers/models/ppdet/modeling/assigners/atss_assigner.py index cbcee0c..57930d7 100644 --- a/paddlers/models/ppdet/modeling/assigners/atss_assigner.py +++ b/paddlers/models/ppdet/modeling/assigners/atss_assigner.py @@ -22,11 +22,13 @@ import paddle.nn as nn import paddle.nn.functional as F from paddlers.models.ppdet.core.workspace import register -from ..ops import iou_similarity +from ..bbox_utils import iou_similarity, batch_iou_similarity from ..bbox_utils import bbox_center -from .utils import (pad_gt, check_points_inside_bboxes, compute_max_iou_anchor, +from .utils import (check_points_inside_bboxes, compute_max_iou_anchor, compute_max_iou_gt) +__all__ = ['ATSSAssigner'] + @register class ATSSAssigner(nn.Layer): @@ -48,7 +50,6 @@ class ATSSAssigner(nn.Layer): def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, pad_gt_mask): - pad_gt_mask = pad_gt_mask.tile([1, 1, self.topk]).astype(paddle.bool) gt2anchor_distances_list = paddle.split( gt2anchor_distances, num_anchors_list, axis=-1) num_anchors_index = np.cumsum(num_anchors_list).tolist() @@ -58,15 +59,12 @@ class ATSSAssigner(nn.Layer): for distances, anchors_index in zip(gt2anchor_distances_list, num_anchors_index): num_anchors = distances.shape[-1] - topk_metrics, topk_idxs = paddle.topk( + _, topk_idxs = paddle.topk( distances, self.topk, axis=-1, largest=False) topk_idxs_list.append(topk_idxs + anchors_index) - topk_idxs = paddle.where(pad_gt_mask, topk_idxs, - paddle.zeros_like(topk_idxs)) - is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2) - is_in_topk = paddle.where(is_in_topk > 1, - paddle.zeros_like(is_in_topk), is_in_topk) - is_in_topk_list.append(is_in_topk.astype(gt2anchor_distances.dtype)) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( + axis=-2).astype(gt2anchor_distances.dtype) + is_in_topk_list.append(is_in_topk * pad_gt_mask) is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1) 
topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1) return is_in_topk_list, topk_idxs_list @@ -77,8 +75,10 @@ class ATSSAssigner(nn.Layer): num_anchors_list, gt_labels, gt_bboxes, + pad_gt_mask, bg_index, - gt_scores=None): + gt_scores=None, + pred_bboxes=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py @@ -99,18 +99,18 @@ class ATSSAssigner(nn.Layer): anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4), "xmin, xmax, ymin, ymax" format num_anchors_list (List): num of anchors in each level - gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape(B, n, 4) + gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) + gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) + pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index - gt_scores (Tensor|List[Tensor]|None, float32) Score of gt_bboxes, + gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1), if None, then it will initialize with one_hot label + pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) - assigned_scores (Tensor): (B, L, C) + assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious """ - gt_labels, gt_bboxes, pad_gt_scores, pad_gt_mask = pad_gt( - gt_labels, gt_bboxes, gt_scores) assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 @@ -119,7 +119,8 @@ class ATSSAssigner(nn.Layer): # negative batch if num_max_boxes == 0: - assigned_labels = paddle.full([batch_size, num_anchors], bg_index) + assigned_labels = paddle.full( + [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, self.num_classes]) @@ -149,9 +150,8 @@ class ATSSAssigner(nn.Layer): iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1]) iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \ iou_threshold.std(axis=-1, keepdim=True) - is_in_topk = paddle.where( - iou_candidates > iou_threshold.tile([1, 1, num_anchors]), - is_in_topk, paddle.zeros_like(is_in_topk)) + is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk, + paddle.zeros_like(is_in_topk)) # 6. check the positive sample's center in gt, [B, n, L] is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes) @@ -178,9 +178,6 @@ class ATSSAssigner(nn.Layer): mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) - assert mask_positive_sum.max() == 1, \ - ("one anchor just assign one gt, but received not equals 1. 
" - "Received: %f" % mask_positive_sum.max().item()) # assigned target batch_ind = paddle.arange( @@ -197,10 +194,19 @@ class ATSSAssigner(nn.Layer): gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - assigned_scores = F.one_hot(assigned_labels, self.num_classes) - if gt_scores is not None: + assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) + ind = list(range(self.num_classes + 1)) + ind.remove(bg_index) + assigned_scores = paddle.index_select( + assigned_scores, paddle.to_tensor(ind), axis=-1) + if pred_bboxes is not None: + # assigned iou + ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive + ious = ious.max(axis=-2).unsqueeze(-1) + assigned_scores *= ious + elif gt_scores is not None: gather_scores = paddle.gather( - pad_gt_scores.flatten(), assigned_gt_index.flatten(), axis=0) + gt_scores.flatten(), assigned_gt_index.flatten(), axis=0) gather_scores = gather_scores.reshape([batch_size, num_anchors]) gather_scores = paddle.where(mask_positive_sum > 0, gather_scores, paddle.zeros_like(gather_scores)) diff --git a/paddlers/models/ppdet/modeling/assigners/max_iou_assigner.py b/paddlers/models/ppdet/modeling/assigners/max_iou_assigner.py new file mode 100644 index 0000000..891b707 --- /dev/null +++ b/paddlers/models/ppdet/modeling/assigners/max_iou_assigner.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddlers.models.ppdet.core.workspace import register +from paddlers.models.ppdet.modeling.proposal_generator.target import label_box + +__all__ = ['MaxIoUAssigner'] + + +@register +class MaxIoUAssigner(object): + """a standard bbox assigner based on max IoU, use ppdet's label_box + as backend. 
+ Args: + positive_overlap (float): threshold for defining positive samples + negative_overlap (float): threshold for defining negative samples + allow_low_quality (bool): whether to lower the IoU threshold if a GT poorly + overlaps with candidate bboxes + """ + + def __init__(self, + positive_overlap, + negative_overlap, + allow_low_quality=True): + self.positive_overlap = positive_overlap + self.negative_overlap = negative_overlap + self.allow_low_quality = allow_low_quality + + def __call__(self, bboxes, gt_bboxes): + matches, match_labels = label_box( + bboxes, + gt_bboxes, + positive_overlap=self.positive_overlap, + negative_overlap=self.negative_overlap, + allow_low_quality=self.allow_low_quality, + ignore_thresh=-1, + is_crowd=None, + assign_on_cpu=False) + return matches, match_labels diff --git a/paddlers/models/ppdet/modeling/assigners/simota_assigner.py b/paddlers/models/ppdet/modeling/assigners/simota_assigner.py index a0fe723..26618ac 100644 --- a/paddlers/models/ppdet/modeling/assigners/simota_assigner.py +++ b/paddlers/models/ppdet/modeling/assigners/simota_assigner.py @@ -115,7 +115,10 @@ class SimOTAAssigner(object): def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt): match_matrix = np.zeros_like(cost_matrix.numpy()) # select candidate topk ious for dynamic-k calculation - topk_ious, _ = paddle.topk(pairwise_ious, self.candidate_topk, axis=0) + topk_ious, _ = paddle.topk( + pairwise_ious, + min(self.candidate_topk, pairwise_ious.shape[0]), + axis=0) # calculate dynamic k for each gt dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1) for gt_idx in range(num_gt): diff --git a/paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py b/paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py index 6dae235..5c82b36 100644 --- a/paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py +++ b/paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py @@ -21,10 +21,12 @@ import paddle.nn as nn import paddle.nn.functional as F from paddlers.models.ppdet.core.workspace import register -from ..bbox_utils import iou_similarity -from .utils import (pad_gt, gather_topk_anchors, check_points_inside_bboxes, +from ..bbox_utils import batch_iou_similarity +from .utils import (gather_topk_anchors, check_points_inside_bboxes, compute_max_iou_anchor) +__all__ = ['TaskAlignedAssigner'] + @register class TaskAlignedAssigner(nn.Layer): @@ -43,8 +45,10 @@ class TaskAlignedAssigner(nn.Layer): pred_scores, pred_bboxes, anchor_points, + num_anchors_list, gt_labels, gt_bboxes, + pad_gt_mask, bg_index, gt_scores=None): r"""This code is based on @@ -61,20 +65,18 @@ class TaskAlignedAssigner(nn.Layer): pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format - gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape(B, n, 4) + num_anchors_list (List): num of anchors in each level, shape(L) + gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) + gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) + pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index - gt_scores (Tensor|List[Tensor]|None, float32) Score of gt_bboxes, - shape(B, n, 1), if None, then it will initialize with one_hot label + gt_scores (Tensor|None,
float32) Score of gt_bboxes, shape(B, n, 1) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) assigned_scores (Tensor): (B, L, C) """ assert pred_scores.ndim == pred_bboxes.ndim - - gt_labels, gt_bboxes, pad_gt_scores, pad_gt_mask = pad_gt( - gt_labels, gt_bboxes, gt_scores) assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 @@ -83,14 +85,15 @@ class TaskAlignedAssigner(nn.Layer): # negative batch if num_max_boxes == 0: - assigned_labels = paddle.full([batch_size, num_anchors], bg_index) + assigned_labels = paddle.full( + [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # compute iou between gt and pred bbox, [B, n, L] - ious = iou_similarity(gt_bboxes, pred_bboxes) + ious = batch_iou_similarity(gt_bboxes, pred_bboxes) # gather pred bboxes class score pred_scores = pred_scores.transpose([0, 2, 1]) batch_ind = paddle.arange( @@ -109,9 +112,7 @@ class TaskAlignedAssigner(nn.Layer): # select topk largest alignment metrics pred bbox as candidates # for each gt, [B, n, L] is_in_topk = gather_topk_anchors( - alignment_metrics * is_in_gts, - self.topk, - topk_mask=pad_gt_mask.tile([1, 1, self.topk]).astype(paddle.bool)) + alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) # select positive sample, [B, n, L] mask_positive = is_in_topk * is_in_gts * pad_gt_mask @@ -127,9 +128,6 @@ class TaskAlignedAssigner(nn.Layer): mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) - assert mask_positive_sum.max() == 1, \ - ("one anchor just assign one gt, but received not equals 1. " - "Received: %f" % mask_positive_sum.max().item()) # assigned target assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes @@ -144,7 +142,11 @@ class TaskAlignedAssigner(nn.Layer): gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - assigned_scores = F.one_hot(assigned_labels, num_classes) + assigned_scores = F.one_hot(assigned_labels, num_classes + 1) + ind = list(range(num_classes + 1)) + ind.remove(bg_index) + assigned_scores = paddle.index_select( + assigned_scores, paddle.to_tensor(ind), axis=-1) # rescale alignment metrics alignment_metrics *= mask_positive max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) diff --git a/paddlers/models/ppdet/modeling/assigners/utils.py b/paddlers/models/ppdet/modeling/assigners/utils.py index 0b81b45..01feaba 100644 --- a/paddlers/models/ppdet/modeling/assigners/utils.py +++ b/paddlers/models/ppdet/modeling/assigners/utils.py @@ -88,7 +88,7 @@ def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9): largest (bool) : largest is a flag, if set to true, algorithm will sort by descending order, otherwise sort by ascending order. 
Default: True - topk_mask (Tensor, bool|None): shape[B, n, topk], ignore bbox mask, + topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask, Default: None eps (float): Default: 1e-9 Returns: @@ -98,20 +98,22 @@ def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9): topk_metrics, topk_idxs = paddle.topk( metrics, topk, axis=-1, largest=largest) if topk_mask is None: - topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > eps).tile( - [1, 1, topk]) - topk_idxs = paddle.where(topk_mask, topk_idxs, paddle.zeros_like(topk_idxs)) - is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2) - is_in_topk = paddle.where(is_in_topk > 1, - paddle.zeros_like(is_in_topk), is_in_topk) - return is_in_topk.astype(metrics.dtype) + topk_mask = ( + topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( + axis=-2).astype(metrics.dtype) + return is_in_topk * topk_mask -def check_points_inside_bboxes(points, bboxes, eps=1e-9): +def check_points_inside_bboxes(points, + bboxes, + center_radius_tensor=None, + eps=1e-9): r""" Args: points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format + center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None. eps (float): Default: 1e-9 Returns: is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected @@ -119,12 +121,28 @@ def check_points_inside_bboxes(points, bboxes, eps=1e-9): points = points.unsqueeze([0, 1]) x, y = points.chunk(2, axis=-1) xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1) + # check whether `points` is in `bboxes` l = x - xmin t = y - ymin r = xmax - x b = ymax - y - bbox_ltrb = paddle.concat([l, t, r, b], axis=-1) - return (bbox_ltrb.min(axis=-1) > eps).astype(bboxes.dtype) + delta_ltrb = paddle.concat([l, t, r, b], axis=-1) + is_in_bboxes = (delta_ltrb.min(axis=-1) > eps) + if center_radius_tensor is not None: + # check whether `points` is in `center_radius` + center_radius_tensor = center_radius_tensor.unsqueeze([0, 1]) + cx = (xmin + xmax) * 0.5 + cy = (ymin + ymax) * 0.5 + l = x - (cx - center_radius_tensor) + t = y - (cy - center_radius_tensor) + r = (cx + center_radius_tensor) - x + b = (cy + center_radius_tensor) - y + delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1) + is_in_center = (delta_ltrb_c.min(axis=-1) > eps) + return (paddle.logical_and(is_in_bboxes, is_in_center), + paddle.logical_or(is_in_bboxes, is_in_center)) + + return is_in_bboxes.astype(bboxes.dtype) def compute_max_iou_anchor(ious): @@ -158,7 +176,8 @@ def compute_max_iou_gt(ious): def generate_anchors_for_grid_cell(feats, fpn_strides, grid_cell_size=5.0, - grid_cell_offset=0.5): + grid_cell_offset=0.5, + dtype='float32'): r""" Like ATSS, generate anchors based on grid size. Args: @@ -167,14 +186,16 @@ def generate_anchors_for_grid_cell(feats, grid_cell_size (float): anchor size grid_cell_offset (float): The range is between 0 and 1. Returns: - anchors (List[Tensor]): shape[s, (l, 4)] - num_anchors_list (List[int]): shape[s] - stride_tensor_list (List[Tensor]): shape[s, (l, 1)] + anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format. + anchor_points (Tensor): shape[l, 2], "x, y" format. + num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...]. + stride_tensor (Tensor): shape[l, 1], contains the stride for each scale. 
""" assert len(feats) == len(fpn_strides) anchors = [] + anchor_points = [] num_anchors_list = [] - stride_tensor_list = [] + stride_tensor = [] for feat, stride in zip(feats, fpn_strides): _, _, h, w = feat.shape cell_half_size = grid_cell_size * stride * 0.5 @@ -186,9 +207,19 @@ def generate_anchors_for_grid_cell(feats, shift_x - cell_half_size, shift_y - cell_half_size, shift_x + cell_half_size, shift_y + cell_half_size ], - axis=-1).astype(feat.dtype) + axis=-1).astype(dtype) + anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype) + anchors.append(anchor.reshape([-1, 4])) + anchor_points.append(anchor_point.reshape([-1, 2])) num_anchors_list.append(len(anchors[-1])) - stride_tensor_list.append( - paddle.full([num_anchors_list[-1], 1], stride)) - return anchors, num_anchors_list, stride_tensor_list + stride_tensor.append( + paddle.full( + [num_anchors_list[-1], 1], stride, dtype=dtype)) + anchors = paddle.concat(anchors) + anchors.stop_gradient = True + anchor_points = paddle.concat(anchor_points) + anchor_points.stop_gradient = True + stride_tensor = paddle.concat(stride_tensor) + stride_tensor.stop_gradient = True + return anchors, anchor_points, num_anchors_list, stride_tensor diff --git a/paddlers/models/ppdet/modeling/backbones/__init__.py b/paddlers/models/ppdet/modeling/backbones/__init__.py index 869955f..3447b7d 100644 --- a/paddlers/models/ppdet/modeling/backbones/__init__.py +++ b/paddlers/models/ppdet/modeling/backbones/__init__.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from . import vgg @@ -29,6 +29,11 @@ from . import swin_transformer from . import lcnet from . import hardnet from . import esnet +from . import cspresnet +from . import csp_darknet +from . import convnext +from . import vision_transformer +from . 
import mobileone from .vgg import * from .resnet import * @@ -47,3 +52,8 @@ from .swin_transformer import * from .lcnet import * from .hardnet import * from .esnet import * +from .cspresnet import * +from .csp_darknet import * +from .convnext import * +from .vision_transformer import * +from .mobileone import * diff --git a/paddlers/models/ppdet/modeling/backbones/blazenet.py b/paddlers/models/ppdet/modeling/backbones/blazenet.py index 89a0c62..277b5fe 100644 --- a/paddlers/models/ppdet/modeling/backbones/blazenet.py +++ b/paddlers/models/ppdet/modeling/backbones/blazenet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlers/models/ppdet/modeling/backbones/convnext.py b/paddlers/models/ppdet/modeling/backbones/convnext.py new file mode 100644 index 0000000..b7e4dff --- /dev/null +++ b/paddlers/models/ppdet/modeling/backbones/convnext.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Modified from https://github.com/facebookresearch/ConvNeXt +Copyright (c) Meta Platforms, Inc. and affiliates. +All rights reserved. +This source code is licensed under the license found in the +LICENSE file in the root directory of this source tree. +''' + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant + +import numpy as np + +from paddlers.models.ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec +from .transformer_utils import DropPath, trunc_normal_, zeros_ + +__all__ = ['ConvNeXt'] + + +class Block(nn.Layer): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PaddlePaddle + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2D( + dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + + if layer_scale_init_value > 0: + self.gamma = self.create_parameter( + shape=(dim, ), + attr=ParamAttr(initializer=Constant(layer_scale_init_value))) + else: + self.gamma = None + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( + ) + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.transpose([0, 2, 3, 1]) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose([0, 3, 1, 2]) + x = input + self.drop_path(x) + return x + + +class LayerNorm(nn.Layer): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + + self.weight = self.create_parameter( + shape=(normalized_shape, ), + attr=ParamAttr(initializer=Constant(1.))) + self.bias = self.create_parameter( + shape=(normalized_shape, ), + attr=ParamAttr(initializer=Constant(0.))) + + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, + self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / paddle.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +@register +@serializable +class ConvNeXt(nn.Layer): + r""" ConvNeXt + A Pypaddle impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
+ """ + + arch_settings = { + 'tiny': { + 'depths': [3, 3, 9, 3], + 'dims': [96, 192, 384, 768] + }, + 'small': { + 'depths': [3, 3, 27, 3], + 'dims': [96, 192, 384, 768] + }, + 'base': { + 'depths': [3, 3, 27, 3], + 'dims': [128, 256, 512, 1024] + }, + 'large': { + 'depths': [3, 3, 27, 3], + 'dims': [192, 384, 768, 1536] + }, + 'xlarge': { + 'depths': [3, 3, 27, 3], + 'dims': [256, 512, 1024, 2048] + }, + } + + def __init__( + self, + arch='tiny', + in_chans=3, + drop_path_rate=0., + layer_scale_init_value=1e-6, + return_idx=[1, 2, 3], + norm_output=True, + pretrained=None, ): + super().__init__() + depths = self.arch_settings[arch]['depths'] + dims = self.arch_settings[arch]['dims'] + self.downsample_layers = nn.LayerList( + ) # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2D( + in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm( + dims[0], eps=1e-6, data_format="channels_first")) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm( + dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2D( + dims[i], dims[i + 1], kernel_size=2, stride=2), ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.LayerList( + ) # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential(*[ + Block( + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + self.return_idx = return_idx + self.dims = [dims[i] for i in return_idx] # [::-1] + + self.norm_output = norm_output + if norm_output: + self.norms = nn.LayerList([ + LayerNorm( + c, eps=1e-6, data_format="channels_first") + for c in self.dims + ]) + + self.apply(self._init_weights) + + if pretrained is not None: + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + self.set_state_dict(paddle.load(path)) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2D, nn.Linear)): + trunc_normal_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + output = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + output.append(x) + + outputs = [output[i] for i in self.return_idx] + if self.norm_output: + outputs = [self.norms[i](out) for i, out in enumerate(outputs)] + + return outputs + + def forward(self, x): + x = self.forward_features(x['image']) + return x + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self.dims] diff --git a/paddlers/models/ppdet/modeling/backbones/csp_darknet.py b/paddlers/models/ppdet/modeling/backbones/csp_darknet.py new file mode 100644 index 0000000..f350f85 --- /dev/null +++ b/paddlers/models/ppdet/modeling/backbones/csp_darknet.py @@ -0,0 +1,404 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddlers.models.ppdet.core.workspace import register, serializable +from paddlers.models.ppdet.modeling.initializer import conv_init_ +from ..shape_spec import ShapeSpec + +__all__ = [ + 'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer' +] + + +class BaseConv(nn.Layer): + def __init__(self, + in_channels, + out_channels, + ksize, + stride, + groups=1, + bias=False, + act="silu"): + super(BaseConv, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=(ksize - 1) // 2, + groups=groups, + bias_attr=bias) + self.bn = nn.BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + self._init_weights() + + def _init_weights(self): + conv_init_(self.conv) + + def forward(self, x): + # use 'x * F.sigmoid(x)' replace 'silu' + x = self.bn(self.conv(x)) + y = x * F.sigmoid(x) + return y + + +class DWConv(nn.Layer): + """Depthwise Conv""" + + def __init__(self, + in_channels, + out_channels, + ksize, + stride=1, + bias=False, + act="silu"): + super(DWConv, self).__init__() + self.dw_conv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + bias=bias, + act=act) + self.pw_conv = BaseConv( + in_channels, + out_channels, + ksize=1, + stride=1, + groups=1, + bias=bias, + act=act) + + def forward(self, x): + return self.pw_conv(self.dw_conv(x)) + + +class Focus(nn.Layer): + """Focus width and height information into channel space, used in YOLOX.""" + + def __init__(self, + in_channels, + out_channels, + ksize=3, + stride=1, + bias=False, + act="silu"): + super(Focus, self).__init__() + self.conv = BaseConv( + in_channels * 4, + out_channels, + ksize=ksize, + stride=stride, + bias=bias, + act=act) + + def forward(self, inputs): + # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2] + top_left = inputs[:, :, 0::2, 0::2] + top_right = inputs[:, :, 0::2, 1::2] + bottom_left = inputs[:, :, 1::2, 0::2] + bottom_right = inputs[:, :, 1::2, 1::2] + outputs = paddle.concat( + [top_left, bottom_left, top_right, bottom_right], 1) + return self.conv(outputs) + + +class BottleNeck(nn.Layer): + def __init__(self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + bias=False, + act="silu"): + super(BottleNeck, self).__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = Conv( + hidden_channels, + out_channels, + ksize=3, + stride=1, + bias=bias, + act=act) + self.add_shortcut = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.add_shortcut: + y = y + x + return y + + +class SPPLayer(nn.Layer): + """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + bias=False, + act="silu"): + super(SPPLayer, self).__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.maxpoolings = nn.LayerList([ + nn.MaxPool2D( + kernel_size=ks, stride=1, 
padding=ks // 2)
+            for ks in kernel_sizes
+        ])
+        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+        self.conv2 = BaseConv(
+            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1)
+        x = self.conv2(x)
+        return x
+
+
+class SPPFLayer(nn.Layer):
+    """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
+        equivalent to SPP(k=(5, 9, 13))
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=5,
+                 bias=False,
+                 act='silu'):
+        super(SPPFLayer, self).__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.maxpooling = nn.MaxPool2D(
+            kernel_size=ksize, stride=1, padding=ksize // 2)
+        conv2_channels = hidden_channels * 4
+        self.conv2 = BaseConv(
+            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        y1 = self.maxpooling(x)
+        y2 = self.maxpooling(y1)
+        y3 = self.maxpooling(y2)
+        concats = paddle.concat([x, y1, y2, y3], axis=1)
+        out = self.conv2(concats)
+        return out
+
+
+class CSPLayer(nn.Layer):
+    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5"""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=1,
+                 shortcut=True,
+                 expansion=0.5,
+                 depthwise=False,
+                 bias=False,
+                 act="silu"):
+        super(CSPLayer, self).__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.bottlenecks = nn.Sequential(*[
+            BottleNeck(
+                hidden_channels,
+                hidden_channels,
+                shortcut=shortcut,
+                expansion=1.0,
+                depthwise=depthwise,
+                bias=bias,
+                act=act) for _ in range(num_blocks)
+        ])
+        self.conv3 = BaseConv(
+            hidden_channels * 2,
+            out_channels,
+            ksize=1,
+            stride=1,
+            bias=bias,
+            act=act)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_1 = self.bottlenecks(x_1)
+        x_2 = self.conv2(x)
+        x = paddle.concat([x_1, x_2], axis=1)
+        x = self.conv3(x)
+        return x
+
+
+@register
+@serializable
+class CSPDarkNet(nn.Layer):
+    """
+    CSPDarkNet backbone.
+    Args:
+        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
+            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
+        depth_mult (float): Depth multiplier, multiply number of blocks in
+            CSPLayer, default as 1.0.
+        width_mult (float): Width multiplier, multiply number of channels in
+            each layer, default as 1.0.
+        depthwise (bool): Whether to use depth-wise conv layer.
+        act (str): Activation function type, default as 'silu'.
+        return_idx (list): Index of stages whose feature maps are returned.
+    """
+
+    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
+
+    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
+    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
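+    # e.g. the first 'X' row, [64, 128, 3, True, False]: this stage maps
+    # 64 -> 128 channels (both scaled by width_mult), stacks a 3-block
+    # CSPLayer (3 scaled by depth_mult, min 1) with shortcuts, and no SPP.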
+    arch_settings = {
+        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+              [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, True, True]],
+    }
+
+    def __init__(self,
+                 arch='X',
+                 depth_mult=1.0,
+                 width_mult=1.0,
+                 depthwise=False,
+                 act='silu',
+                 trt=False,
+                 return_idx=[2, 3, 4]):
+        super(CSPDarkNet, self).__init__()
+        self.arch = arch
+        self.return_idx = return_idx
+        Conv = DWConv if depthwise else BaseConv
+        arch_setting = self.arch_settings[arch]
+        base_channels = int(arch_setting[0][0] * width_mult)
+
+        # Note: differences between the latest YOLOv5 and the original YOLOX
+        # 1. self.stem, use Conv (in YOLOv5) or Focus (in YOLOX)
+        # 2. use SPPF (in YOLOv5) or SPP (in YOLOX)
+        # 3. put SPPF before (YOLOv5) or SPP after (YOLOX) the last cspdark block's CSPLayer
+        # 4. whether the SPPF(SPP) stage's CSPLayer adds a shortcut: True in YOLOv5, False in YOLOX
+        if arch in ['P5', 'P6']:
+            # in the latest YOLOv5, use Conv stem, and SPPF (fast, only a single spp kernel size)
+            self.stem = Conv(
+                3, base_channels, ksize=6, stride=2, bias=False, act=act)
+            spp_kernel_sizes = 5
+        elif arch in ['X']:
+            # in the original YOLOX, use Focus stem, and SPP (three spp kernel sizes)
+            self.stem = Focus(
+                3, base_channels, ksize=3, stride=1, bias=False, act=act)
+            spp_kernel_sizes = (5, 9, 13)
+        else:
+            raise AttributeError("Unsupported arch type: {}".format(arch))
+
+        _out_channels = [base_channels]
+        layers_num = 1
+        self.csp_dark_blocks = []
+
+        for i, (in_channels, out_channels, num_blocks, shortcut,
+                use_spp) in enumerate(arch_setting):
+            in_channels = int(in_channels * width_mult)
+            out_channels = int(out_channels * width_mult)
+            _out_channels.append(out_channels)
+            num_blocks = max(round(num_blocks * depth_mult), 1)
+            stage = []
+
+            conv_layer = self.add_sublayer(
+                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
+                Conv(
+                    in_channels, out_channels, 3, 2, bias=False, act=act))
+            stage.append(conv_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['X']:
+                # in YOLOX use SPPLayer
+                spp_layer = self.add_sublayer(
+                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
+                    SPPLayer(
+                        out_channels,
+                        out_channels,
+                        kernel_sizes=spp_kernel_sizes,
+                        bias=False,
+                        act=act))
+                stage.append(spp_layer)
+                layers_num += 1
+
+            csp_layer = self.add_sublayer(
+                'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
+                CSPLayer(
+                    out_channels,
+                    out_channels,
+                    num_blocks=num_blocks,
+                    shortcut=shortcut,
+                    depthwise=depthwise,
+                    bias=False,
+                    act=act))
+            stage.append(csp_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['P5', 'P6']:
+                # in latest YOLOv5 use SPPFLayer instead of SPPLayer
+                sppf_layer = self.add_sublayer(
+                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
+                    SPPFLayer(
+                        out_channels,
+                        out_channels,
+                        ksize=5,
+                        bias=False,
+                        act=act))
+                stage.append(sppf_layer)
+                layers_num += 1
+
+            self.csp_dark_blocks.append(nn.Sequential(*stage))
+
+        self._out_channels = [_out_channels[i] for i in self.return_idx]
+        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
+
+    def forward(self, inputs):
+        x = inputs['image']
+        outputs = []
+        x = self.stem(x)
+        for i, layer in enumerate(self.csp_dark_blocks):
+            x = layer(x)
+            if i + 1 in self.return_idx:
+                outputs.append(x)
+        return outputs
+
+    @property
+    def out_shape(self):
+
return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/paddlers/models/ppdet/modeling/backbones/cspresnet.py b/paddlers/models/ppdet/modeling/backbones/cspresnet.py new file mode 100644 index 0000000..f286c6d --- /dev/null +++ b/paddlers/models/ppdet/modeling/backbones/cspresnet.py @@ -0,0 +1,321 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Constant + +from paddlers.models.ppdet.modeling.ops import get_act_fn +from paddlers.models.ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + act=None): + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + + self.bn = nn.BatchNorm2D( + ch_out, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + return x + + +class RepVggBlock(nn.Layer): + def __init__(self, ch_in, ch_out, act='relu', alpha=False): + super(RepVggBlock, self).__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=None) + self.conv2 = ConvBNLayer( + ch_in, ch_out, 1, stride=1, padding=0, act=None) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + if alpha: + self.alpha = self.create_parameter( + shape=[1], + attr=ParamAttr(initializer=Constant(value=1.)), + dtype="float32") + else: + self.alpha = None + + def forward(self, x): + if hasattr(self, 'conv'): + y = self.conv(x) + else: + if self.alpha: + y = self.conv1(x) + self.alpha * self.conv2(x) + else: + y = self.conv1(x) + self.conv2(x) + y = self.act(y) + return y + + def convert_to_deploy(self): + if not hasattr(self, 'conv'): + self.conv = nn.Conv2D( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1) + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.set_value(kernel) + self.conv.bias.set_value(bias) + self.__delattr__('conv1') + self.__delattr__('conv2') + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + if self.alpha: + 
return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + self.alpha * bias1x1 + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class BasicBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + act='relu', + shortcut=True, + use_alpha=False): + super(BasicBlock, self).__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) + self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return paddle.add(x, y) + else: + return y + + +class EffectiveSELayer(nn.Layer): + """ Effective Squeeze-Excitation + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + """ + + def __init__(self, channels, act='hardsigmoid'): + super(EffectiveSELayer, self).__init__() + self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.act(x_se) + + +class CSPResStage(nn.Layer): + def __init__(self, + block_fn, + ch_in, + ch_out, + n, + stride, + act='relu', + attn='eca', + use_alpha=False): + super(CSPResStage, self).__init__() + + ch_mid = (ch_in + ch_out) // 2 + if stride == 2: + self.conv_down = ConvBNLayer( + ch_in, ch_mid, 3, stride=2, padding=1, act=act) + else: + self.conv_down = None + self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.blocks = nn.Sequential(*[ + block_fn( + ch_mid // 2, + ch_mid // 2, + act=act, + shortcut=True, + use_alpha=use_alpha) for i in range(n) + ]) + if attn: + self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') + else: + self.attn = None + + self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = paddle.concat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@register +@serializable +class CSPResNet(nn.Layer): + __shared__ = ['width_mult', 'depth_mult', 'trt'] + + def __init__(self, + layers=[3, 6, 6, 3], + channels=[64, 128, 256, 512, 1024], + act='swish', + return_idx=[1, 2, 3], + depth_wise=False, + use_large_stem=False, + width_mult=1.0, + depth_mult=1.0, + trt=False, + use_checkpoint=False, + use_alpha=False, + **args): + super(CSPResNet, self).__init__() + self.use_checkpoint = use_checkpoint + channels = [max(round(c * width_mult), 1) for c in channels] + layers = [max(round(l * depth_mult), 1) for l in layers] + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, dict)) else act + + if use_large_stem: + self.stem = nn.Sequential( + ('conv1', ConvBNLayer( + 3, channels[0] 
// 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0] // 2, + 3, + stride=1, + padding=1, + act=act)), ('conv3', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))) + else: + self.stem = nn.Sequential( + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))) + + n = len(channels) - 1 + self.stages = nn.Sequential(*[(str(i), CSPResStage( + BasicBlock, + channels[i], + channels[i + 1], + layers[i], + 2, + act=act, + use_alpha=use_alpha)) for i in range(n)]) + + self._out_channels = channels[1:] + self._out_strides = [4 * 2**i for i in range(n)] + self.return_idx = return_idx + if use_checkpoint: + paddle.seed(0) + + def forward(self, inputs): + x = inputs['image'] + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + if self.use_checkpoint and self.training: + x = paddle.distributed.fleet.utils.recompute( + stage, x, **{"preserve_rng_state": True}) + else: + x = stage(x) + if idx in self.return_idx: + outs.append(x) + + return outs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] diff --git a/paddlers/models/ppdet/modeling/backbones/darknet.py b/paddlers/models/ppdet/modeling/backbones/darknet.py old mode 100644 new mode 100755 index 160395e..f730e05 --- a/paddlers/models/ppdet/modeling/backbones/darknet.py +++ b/paddlers/models/ppdet/modeling/backbones/darknet.py @@ -77,8 +77,8 @@ class ConvBNLayer(nn.Layer): out = self.batch_norm(out) if self.act == 'leaky': out = F.leaky_relu(out, 0.1) - elif self.act == 'mish': - out = mish(out) + else: + out = getattr(F, self.act)(out) return out @@ -149,9 +149,14 @@ class BasicBlock(nn.Layer): super(BasicBlock, self).__init__() + assert ch_in == ch_out and (ch_in % 2) == 0, \ + f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}" + # example: + # --------------{conv1} --> {conv2} + # channel route: 10-->5 --> 5-->10 self.conv1 = ConvBNLayer( ch_in=ch_in, - ch_out=ch_out, + ch_out=int(ch_out / 2), filter_size=1, stride=1, padding=0, @@ -160,8 +165,8 @@ class BasicBlock(nn.Layer): freeze_norm=freeze_norm, data_format=data_format) self.conv2 = ConvBNLayer( - ch_in=ch_out, - ch_out=ch_out * 2, + ch_in=int(ch_out / 2), + ch_out=ch_out, filter_size=3, stride=1, padding=1, @@ -215,7 +220,7 @@ class Blocks(nn.Layer): res_out = self.add_sublayer( block_name, BasicBlock( - ch_out * 2, + ch_out, ch_out, norm_type=norm_type, norm_decay=norm_decay, @@ -296,7 +301,7 @@ class DarkNet(nn.Layer): name, Blocks( int(ch_in[i]), - 32 * (2**i), + int(ch_in[i]), stage, norm_type=norm_type, norm_decay=norm_decay, @@ -305,14 +310,14 @@ class DarkNet(nn.Layer): name=name)) self.darknet_conv_block_list.append(conv_block) if i in return_idx: - self._out_channels.append(64 * (2**i)) + self._out_channels.append(int(ch_in[i])) for i in range(num_stages - 1): down_name = 'stage.{}.downsample'.format(i) downsample = self.add_sublayer( down_name, DownSample( - ch_in=32 * (2**(i + 1)), - ch_out=32 * (2**(i + 2)), + ch_in=int(ch_in[i]), + ch_out=int(ch_in[i + 1]), norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, diff --git a/paddlers/models/ppdet/modeling/backbones/dla.py b/paddlers/models/ppdet/modeling/backbones/dla.py old mode 100644 new mode 100755 index 17966fa..70c52a8 --- 
a/paddlers/models/ppdet/modeling/backbones/dla.py +++ b/paddlers/models/ppdet/modeling/backbones/dla.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import paddle diff --git a/paddlers/models/ppdet/modeling/backbones/esnet.py b/paddlers/models/ppdet/modeling/backbones/esnet.py index 9a18d9b..fb13330 100644 --- a/paddlers/models/ppdet/modeling/backbones/esnet.py +++ b/paddlers/models/ppdet/modeling/backbones/esnet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr -from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D +from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm from paddle.nn.initializer import KaimingNormal from paddle.regularizer import L2Decay diff --git a/paddlers/models/ppdet/modeling/backbones/ghostnet.py b/paddlers/models/ppdet/modeling/backbones/ghostnet.py index 4236f04..ef4ac8a 100644 --- a/paddlers/models/ppdet/modeling/backbones/ghostnet.py +++ b/paddlers/models/ppdet/modeling/backbones/ghostnet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
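# A numeric check of the conv+BN reparameterization used by
# RepVggBlock.get_equivalent_kernel_bias() in cspresnet.py above: fold each
# BN into its conv, pad the 1x1 kernel to 3x3, and sum the kernels and
# biases. Minimal standalone sketch (paddle only; eval mode, since the
# fusion is defined in terms of the BN running statistics):
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

def fuse(conv, bn):
    # returns the (kernel, bias) of the single conv that replaces conv -> bn
    std = (bn._variance + bn._epsilon).sqrt()
    t = (bn.weight / std).reshape((-1, 1, 1, 1))
    return conv.weight * t, bn.bias - bn._mean * bn.weight / std

c3, b3 = nn.Conv2D(8, 8, 3, padding=1, bias_attr=False), nn.BatchNorm2D(8)
c1, b1 = nn.Conv2D(8, 8, 1, bias_attr=False), nn.BatchNorm2D(8)
b3.eval()
b1.eval()
k3, bias3 = fuse(c3, b3)
k1, bias1 = fuse(c1, b1)
fused = nn.Conv2D(8, 8, 3, padding=1)
fused.weight.set_value(k3 + F.pad(k1, [1, 1, 1, 1]))  # pad 1x1 -> 3x3
fused.bias.set_value(bias3 + bias1)
x = paddle.randn([2, 8, 16, 16])
print(bool(paddle.allclose(fused(x), b3(c3(x)) + b1(c1(x)), atol=1e-5)))  # True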
@@ -299,17 +299,17 @@ class GhostBottleneck(nn.Layer): class GhostNet(nn.Layer): __shared__ = ['norm_type'] - def __init__(self, - scale=1.3, - feature_maps=[6, 12, 15], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], - [64, 128]], - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - conv_decay=0., - norm_type='bn', - norm_decay=0.0, - freeze_norm=False): + def __init__( + self, + scale=1.3, + feature_maps=[6, 12, 15], + with_extra_blocks=False, + extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + conv_decay=0., + norm_type='bn', + norm_decay=0.0, + freeze_norm=False): super(GhostNet, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] diff --git a/paddlers/models/ppdet/modeling/backbones/hardnet.py b/paddlers/models/ppdet/modeling/backbones/hardnet.py index 71c5a09..87a2c51 100644 --- a/paddlers/models/ppdet/modeling/backbones/hardnet.py +++ b/paddlers/models/ppdet/modeling/backbones/hardnet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -146,7 +146,7 @@ class HarDBlock(nn.Layer): class HarDNet(nn.Layer): def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85): super(HarDNet, self).__init__() - assert arch in [39, 68, 85], "HarDNet-{} not support.".format(arch) + assert arch in [68, 85], "HarDNet-{} is not supported.".format(arch) if arch == 85: first_ch = [48, 96] second_kernel = 3 @@ -161,6 +161,8 @@ class HarDNet(nn.Layer): grmul = 1.7 gr = [14, 16, 20, 40] n_layers = [8, 16, 16, 16] + else: + raise ValueError("HarDNet-{} is not supported.".format(arch)) self.return_idx = return_idx self._out_channels = [96, 214, 458, 784] diff --git a/paddlers/models/ppdet/modeling/backbones/lcnet.py b/paddlers/models/ppdet/modeling/backbones/lcnet.py index 3ac51ae..90bbcc8 100644 --- a/paddlers/models/ppdet/modeling/backbones/lcnet.py +++ b/paddlers/models/ppdet/modeling/backbones/lcnet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
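# A numeric check of the "equivalent to SPP(k=(5, 9, 13))" claim in the
# SPPFLayer docstring of csp_darknet.py above: a stride-1 5x5 max-pool
# applied two and three times covers the same windows as single 9x9 and
# 13x13 pools, so SPP and SPPF concatenate identical tensors. Standalone
# sketch (a positive input keeps zero-padding out of the comparison):
import paddle
import paddle.nn as nn

def mp(k):
    return nn.MaxPool2D(kernel_size=k, stride=1, padding=k // 2)

x = paddle.rand([1, 8, 32, 32]) + 0.1
y1 = mp(5)(x)
y2 = mp(5)(y1)
y3 = mp(5)(y2)
spp = paddle.concat([x, mp(5)(x), mp(9)(x), mp(13)(x)], axis=1)
sppf = paddle.concat([x, y1, y2, y3], axis=1)
print(bool(paddle.allclose(spp, sppf)))  # True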
@@ -68,7 +68,8 @@ class ConvBNLayer(nn.Layer): filter_size, num_filters, stride, - num_groups=1): + num_groups=1, + act='hard_swish'): super().__init__() self.conv = Conv2D( @@ -85,12 +86,15 @@ class ConvBNLayer(nn.Layer): num_filters, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.hardswish = nn.Hardswish() + if act == 'hard_swish': + self.act = nn.Hardswish() + elif act == 'relu6': + self.act = nn.ReLU6() def forward(self, x): x = self.conv(x) x = self.bn(x) - x = self.hardswish(x) + x = self.act(x) return x @@ -100,7 +104,8 @@ class DepthwiseSeparable(nn.Layer): num_filters, stride, dw_size=3, - use_se=False): + use_se=False, + act='hard_swish'): super().__init__() self.use_se = use_se self.dw_conv = ConvBNLayer( @@ -108,14 +113,16 @@ class DepthwiseSeparable(nn.Layer): num_filters=num_channels, filter_size=dw_size, stride=stride, - num_groups=num_channels) + num_groups=num_channels, + act=act) if use_se: self.se = SEModule(num_channels) self.pw_conv = ConvBNLayer( num_channels=num_channels, filter_size=1, num_filters=num_filters, - stride=1) + stride=1, + act=act) def forward(self, x): x = self.dw_conv(x) @@ -158,7 +165,7 @@ class SEModule(nn.Layer): @register @serializable class LCNet(nn.Layer): - def __init__(self, scale=1.0, feature_maps=[3, 4, 5]): + def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'): super().__init__() self.scale = scale self.feature_maps = feature_maps @@ -169,7 +176,8 @@ class LCNet(nn.Layer): num_channels=3, filter_size=3, num_filters=make_divisible(16 * scale), - stride=2) + stride=2, + act=act) self.blocks2 = nn.Sequential(*[ DepthwiseSeparable( @@ -177,7 +185,8 @@ class LCNet(nn.Layer): num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, - use_se=se) + use_se=se, + act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) ]) @@ -187,7 +196,8 @@ class LCNet(nn.Layer): num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, - use_se=se) + use_se=se, + act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) ]) @@ -200,7 +210,8 @@ class LCNet(nn.Layer): num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, - use_se=se) + use_se=se, + act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) ]) @@ -213,7 +224,8 @@ class LCNet(nn.Layer): num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, - use_se=se) + use_se=se, + act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) ]) @@ -226,7 +238,8 @@ class LCNet(nn.Layer): num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, - use_se=se) + use_se=se, + act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) ]) diff --git a/paddlers/models/ppdet/modeling/backbones/mobilenet_v1.py b/paddlers/models/ppdet/modeling/backbones/mobilenet_v1.py index 0e8b5e1..a839efe 100644 --- a/paddlers/models/ppdet/modeling/backbones/mobilenet_v1.py +++ b/paddlers/models/ppdet/modeling/backbones/mobilenet_v1.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
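# Usage sketch for the new `act` argument threaded through LCNet above. Only
# 'hard_swish' and 'relu6' are wired up in ConvBNLayer, so any other value
# would leave self.act unset. The import path assumes the vendored layout of
# this patch, and the input follows the usual ppdet dict convention:
import paddle
from paddlers.models.ppdet.modeling.backbones.lcnet import LCNet

model = LCNet(scale=1.0, feature_maps=[3, 4, 5], act='relu6')
feats = model({'image': paddle.randn([1, 3, 320, 320])})
print([f.shape for f in feats])  # one feature map per requested stage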
diff --git a/paddlers/models/ppdet/modeling/backbones/mobilenet_v3.py b/paddlers/models/ppdet/modeling/backbones/mobilenet_v3.py index c7e75bb..be4e7e9 100644 --- a/paddlers/models/ppdet/modeling/backbones/mobilenet_v3.py +++ b/paddlers/models/ppdet/modeling/backbones/mobilenet_v3.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -282,19 +282,19 @@ class ExtraBlockDW(nn.Layer): class MobileNetV3(nn.Layer): __shared__ = ['norm_type'] - def __init__(self, - scale=1.0, - model_name="large", - feature_maps=[6, 12, 15], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], - [64, 128]], - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - conv_decay=0.0, - multiplier=1.0, - norm_type='bn', - norm_decay=0.0, - freeze_norm=False): + def __init__( + self, + scale=1.0, + model_name="large", + feature_maps=[6, 12, 15], + with_extra_blocks=False, + extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + conv_decay=0.0, + multiplier=1.0, + norm_type='bn', + norm_decay=0.0, + freeze_norm=False): super(MobileNetV3, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] diff --git a/paddlers/models/ppdet/modeling/backbones/mobileone.py b/paddlers/models/ppdet/modeling/backbones/mobileone.py new file mode 100644 index 0000000..fe09e45 --- /dev/null +++ b/paddlers/models/ppdet/modeling/backbones/mobileone.py @@ -0,0 +1,266 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
+Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py +Ths copyright of microsoft/Swin-Transformer is as follows: +MIT License [see LICENSE for details] +""" + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Normal, Constant + +from paddlers.models.ppdet.modeling.ops import get_act_fn +from paddlers.models.ppdet.modeling.layers import ConvNormLayer + + +class MobileOneBlock(nn.Layer): + def __init__( + self, + ch_in, + ch_out, + stride, + kernel_size, + conv_num=1, + norm_type='bn', + norm_decay=0., + norm_groups=32, + bias_on=False, + lr_scale=1., + freeze_norm=False, + initializer=Normal( + mean=0., std=0.01), + skip_quant=False, + act='relu', ): + super(MobileOneBlock, self).__init__() + + self.ch_in = ch_in + self.ch_out = ch_out + self.kernel_size = kernel_size + self.stride = stride + self.padding = (kernel_size - 1) // 2 + self.k = conv_num + + self.depth_conv = nn.LayerList() + self.point_conv = nn.LayerList() + for _ in range(self.k): + self.depth_conv.append( + ConvNormLayer( + ch_in, + ch_in, + kernel_size, + stride=stride, + groups=ch_in, + norm_type=norm_type, + norm_decay=norm_decay, + norm_groups=norm_groups, + bias_on=bias_on, + lr_scale=lr_scale, + freeze_norm=freeze_norm, + initializer=initializer, + skip_quant=skip_quant)) + self.point_conv.append( + ConvNormLayer( + ch_in, + ch_out, + 1, + stride=1, + groups=1, + norm_type=norm_type, + norm_decay=norm_decay, + norm_groups=norm_groups, + bias_on=bias_on, + lr_scale=lr_scale, + freeze_norm=freeze_norm, + initializer=initializer, + skip_quant=skip_quant)) + self.rbr_1x1 = ConvNormLayer( + ch_in, + ch_in, + 1, + stride=self.stride, + groups=ch_in, + norm_type=norm_type, + norm_decay=norm_decay, + norm_groups=norm_groups, + bias_on=bias_on, + lr_scale=lr_scale, + freeze_norm=freeze_norm, + initializer=initializer, + skip_quant=skip_quant) + self.rbr_identity_st1 = nn.BatchNorm2D( + num_features=ch_in, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay( + 0.0))) if ch_in == ch_out and self.stride == 1 else None + self.rbr_identity_st2 = nn.BatchNorm2D( + num_features=ch_out, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay( + 0.0))) if ch_in == ch_out and self.stride == 1 else None + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + + def forward(self, x): + if hasattr(self, "conv1") and hasattr(self, "conv2"): + y = self.act(self.conv2(self.act(self.conv1(x)))) + else: + if self.rbr_identity_st1 is None: + id_out_st1 = 0 + else: + id_out_st1 = self.rbr_identity_st1(x) + + x1_1 = 0 + for i in range(self.k): + x1_1 += self.depth_conv[i](x) + + x1_2 = self.rbr_1x1(x) + x1 = self.act(x1_1 + x1_2 + id_out_st1) + + if self.rbr_identity_st2 is None: + id_out_st2 = 0 + else: + id_out_st2 = self.rbr_identity_st2(x1) + + x2_1 = 0 + for i in range(self.k): + x2_1 += self.point_conv[i](x1) + y = self.act(x2_1 + id_out_st2) + + return y + + def convert_to_deploy(self): + if not hasattr(self, 'conv1'): + self.conv1 = nn.Conv2D( + in_channels=self.ch_in, + out_channels=self.ch_in, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + groups=self.ch_in, + bias_attr=ParamAttr( + initializer=Constant(value=0.), learning_rate=1.)) + if not hasattr(self, 'conv2'): + self.conv2 = nn.Conv2D( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=1, + stride=1, + 
padding='SAME', + groups=1, + bias_attr=ParamAttr( + initializer=Constant(value=0.), learning_rate=1.)) + + conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias( + ) + self.conv1.weight.set_value(conv1_kernel) + self.conv1.bias.set_value(conv1_bias) + self.conv2.weight.set_value(conv2_kernel) + self.conv2.bias.set_value(conv2_bias) + self.__delattr__('depth_conv') + self.__delattr__('point_conv') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity_st1'): + self.__delattr__('rbr_identity_st1') + if hasattr(self, 'rbr_identity_st2'): + self.__delattr__('rbr_identity_st2') + + def get_equivalent_kernel_bias(self): + st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv) + st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + st1_kernelid, st1_biasid = self._fuse_bn_tensor( + self.rbr_identity_st1, kernel_size=self.kernel_size) + + st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv) + st2_kernelid, st2_biasid = self._fuse_bn_tensor( + self.rbr_identity_st2, kernel_size=1) + + conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor( + st1_kernel1x1) + st1_kernelid + + conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid + + conv2_kernel = st2_kernel1x1 + st2_kernelid + conv2_bias = st2_bias1x1 + st2_biasid + + return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + padding_size = (self.kernel_size - 1) // 2 + return nn.functional.pad( + kernel1x1, + [padding_size, padding_size, padding_size, padding_size]) + + def _fuse_bn_tensor(self, branch, kernel_size=3): + if branch is None: + return 0, 0 + + if isinstance(branch, nn.LayerList): + fused_kernels = [] + fused_bias = [] + for block in branch: + kernel = block.conv.weight + running_mean = block.norm._mean + running_var = block.norm._variance + gamma = block.norm.weight + beta = block.norm.bias + eps = block.norm._epsilon + + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + + fused_kernels.append(kernel * t) + fused_bias.append(beta - running_mean * gamma / std) + + return sum(fused_kernels), sum(fused_bias) + + elif isinstance(branch, ConvNormLayer): + kernel = branch.conv.weight + running_mean = branch.norm._mean + running_var = branch.norm._variance + gamma = branch.norm.weight + beta = branch.norm.bias + eps = branch.norm._epsilon + else: + assert isinstance(branch, nn.BatchNorm2D) + input_dim = self.ch_in if kernel_size == 1 else 1 + kernel_value = paddle.zeros( + shape=[self.ch_in, input_dim, kernel_size, kernel_size], + dtype='float32') + if kernel_size > 1: + for i in range(self.ch_in): + kernel_value[i, i % input_dim, (kernel_size - 1) // 2, ( + kernel_size - 1) // 2] = 1 + elif kernel_size == 1: + for i in range(self.ch_in): + kernel_value[i, i % input_dim, 0, 0] = 1 + else: + raise ValueError("Invalid kernel size recieved!") + kernel = paddle.to_tensor(kernel_value, place=branch.weight.place) + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + + return kernel * t, beta - running_mean * gamma / std diff --git a/paddlers/models/ppdet/modeling/backbones/resnet.py b/paddlers/models/ppdet/modeling/backbones/resnet.py old mode 100644 new mode 100755 index 9287cd4..d46a664 --- a/paddlers/models/ppdet/modeling/backbones/resnet.py +++ 
b/paddlers/models/ppdet/modeling/backbones/resnet.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import math @@ -446,13 +446,13 @@ class ResNet(nn.Layer): std_senet=False): """ Residual Network, see https://arxiv.org/abs/1512.03385 - + Args: depth (int): ResNet depth, should be 18, 34, 50, 101, 152. ch_in (int): output channel of first stage, default 64 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), - lower learning rate ratio is need for pretrained model + lower learning rate ratio is need for pretrained model got using distillation(default as [1.0, 1.0, 1.0, 1.0]). groups (int): group convolution cardinality base_width (int): base width of each group convolution diff --git a/paddlers/models/ppdet/modeling/backbones/senet.py b/paddlers/models/ppdet/modeling/backbones/senet.py index d83dc42..de61e85 100644 --- a/paddlers/models/ppdet/modeling/backbones/senet.py +++ b/paddlers/models/ppdet/modeling/backbones/senet.py @@ -1,21 +1,23 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
import paddle.nn as nn from paddlers.models.ppdet.core.workspace import register, serializable from .resnet import ResNet, Blocks, BasicBlock, BottleNeck +from ..shape_spec import ShapeSpec +from .name_adapter import NameAdapter __all__ = ['SENet', 'SERes5Head'] @@ -41,12 +43,12 @@ class SENet(ResNet): num_stages=4): """ Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507 - + Args: depth (int): SENet depth, should be 50, 101, 152 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), - lower learning rate ratio is need for pretrained model + lower learning rate ratio is need for pretrained model got using distillation(default as [1.0, 1.0, 1.0, 1.0]). groups (int): group convolution cardinality base_width (int): base width of each group convolution @@ -103,7 +105,7 @@ class SERes5Head(nn.Layer): norm_decay (float): weight decay for normalization layer weights dcn_v2_stages (list): index of stages who select deformable conv v2 std_senet (bool): whether use senet, default True - + """ super(SERes5Head, self).__init__() ch_out = 512 diff --git a/paddlers/models/ppdet/modeling/backbones/shufflenet_v2.py b/paddlers/models/ppdet/modeling/backbones/shufflenet_v2.py index be3b86f..ce9a82d 100644 --- a/paddlers/models/ppdet/modeling/backbones/shufflenet_v2.py +++ b/paddlers/models/ppdet/modeling/backbones/shufflenet_v2.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -188,11 +188,10 @@ class ShuffleNetV2(nn.Layer): elif scale == 1.5: stage_out_channels = [-1, 24, 176, 352, 704, 1024] elif scale == 2.0: - stage_out_channels = [-1, 24, 224, 488, 976, 2048] + stage_out_channels = [-1, 24, 244, 488, 976, 2048] else: raise NotImplementedError("This scale size:[" + str(scale) + "] is not implemented!") - self._out_channels = [] self._feature_idx = 0 # 1. conv1 diff --git a/paddlers/models/ppdet/modeling/backbones/swin_transformer.py b/paddlers/models/ppdet/modeling/backbones/swin_transformer.py index b35fe71..616964f 100644 --- a/paddlers/models/ppdet/modeling/backbones/swin_transformer.py +++ b/paddlers/models/ppdet/modeling/backbones/swin_transformer.py @@ -20,62 +20,13 @@ MIT License [see LICENSE for details] import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle.nn.initializer import TruncatedNormal, Constant, Assign from paddlers.models.ppdet.modeling.shape_spec import ShapeSpec from paddlers.models.ppdet.core.workspace import register, serializable import numpy as np -# Common initializations -ones_ = Constant(value=1.) -zeros_ = Constant(value=0.) -trunc_normal_ = TruncatedNormal(std=.02) - - -# Common Functions -def to_2tuple(x): - return tuple([x] * 2) - - -def add_parameter(layer, datas, name=None): - parameter = layer.create_parameter( - shape=(datas.shape), default_initializer=Assign(datas)) - if name: - layer.add_parameter(name, parameter) - return parameter - - -# Common Layers -def drop_path(x, drop_prob=0., training=False): - """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
- """ - if drop_prob == 0. or not training: - return x - keep_prob = paddle.to_tensor(1 - drop_prob) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Identity(nn.Layer): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, input): - return input +from .transformer_utils import DropPath, Identity +from .transformer_utils import add_parameter, to_2tuple +from .transformer_utils import ones_, zeros_, trunc_normal_ class Mlp(nn.Layer): @@ -112,7 +63,7 @@ def window_partition(x, window_size): """ B, H, W, C = x.shape x = x.reshape( - [B, H // window_size, window_size, W // window_size, window_size, C]) + [-1, H // window_size, window_size, W // window_size, window_size, C]) windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( [-1, window_size, window_size, C]) return windows @@ -128,10 +79,11 @@ def window_reverse(windows, window_size, H, W): Returns: x: (B, H, W, C) """ + _, _, _, C = windows.shape B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.reshape( - [B, H // window_size, W // window_size, window_size, window_size, -1]) - x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) + [-1, H // window_size, W // window_size, window_size, window_size, C]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) return x @@ -206,14 +158,14 @@ class WindowAttention(nn.Layer): """ B_, N, C = x.shape qkv = self.qkv(x).reshape( - [B_, N, 3, self.num_heads, C // self.num_heads]).transpose( + [-1, N, 3, self.num_heads, C // self.num_heads]).transpose( [2, 0, 3, 1, 4]) q, k, v = qkv[0], qkv[1], qkv[2] q = q * self.scale attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) - index = self.relative_position_index.reshape([-1]) + index = self.relative_position_index.flatten() relative_position_bias = paddle.index_select( self.relative_position_bias_table, index) @@ -227,7 +179,7 @@ class WindowAttention(nn.Layer): if mask is not None: nW = mask.shape[0] - attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + attn = attn.reshape([-1, nW, self.num_heads, N, N ]) + mask.unsqueeze(1).unsqueeze(0) attn = attn.reshape([-1, self.num_heads, N, N]) attn = self.softmax(attn) @@ -237,7 +189,7 @@ class WindowAttention(nn.Layer): attn = self.attn_drop(attn) # x = (attn @ v).transpose(1, 2).reshape([B_, N, C]) - x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C]) + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) x = self.proj(x) x = self.proj_drop(x) return x @@ -315,7 +267,7 @@ class SwinTransformerBlock(nn.Layer): shortcut = x x = self.norm1(x) - x = x.reshape([B, H, W, C]) + x = x.reshape([-1, H, W, C]) # pad feature maps to multiples of window size pad_l = pad_t = 0 @@ -337,7 +289,7 @@ class SwinTransformerBlock(nn.Layer): x_windows = window_partition( shifted_x, self.window_size) # nW*B, window_size, window_size, C x_windows = x_windows.reshape( - [-1, self.window_size * self.window_size, + [x_windows.shape[0], self.window_size * self.window_size, C]) # nW*B, window_size*window_size, C # W-MSA/SW-MSA @@ -346,7 +298,7 @@ class SwinTransformerBlock(nn.Layer): # merge windows attn_windows = attn_windows.reshape( - [-1, 
self.window_size, self.window_size, C]) + [x_windows.shape[0], self.window_size, self.window_size, C]) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C @@ -362,7 +314,7 @@ class SwinTransformerBlock(nn.Layer): if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :] - x = x.reshape([B, H * W, C]) + x = x.reshape([-1, H * W, C]) # FFN x = shortcut + self.drop_path(x) @@ -393,7 +345,7 @@ class PatchMerging(nn.Layer): B, L, C = x.shape assert L == H * W, "input feature has wrong size" - x = x.reshape([B, H, W, C]) + x = x.reshape([-1, H, W, C]) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) @@ -405,7 +357,7 @@ class PatchMerging(nn.Layer): x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C - x = x.reshape([B, H * W // 4, 4 * C]) # B H/2*W/2 4*C + x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) @@ -482,8 +434,7 @@ class BasicLayer(nn.Layer): # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size - img_mask = paddle.fluid.layers.zeros( - [1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1 + img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) @@ -688,10 +639,10 @@ class SwinTransformer(nn.Layer): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): - param.requires_grad = False + param.stop_gradient = True if self.frozen_stages >= 1 and self.ape: - self.absolute_pos_embed.requires_grad = False + self.absolute_pos_embed.stop_gradient = True if self.frozen_stages >= 2: self.pos_drop.eval() @@ -699,7 +650,7 @@ class SwinTransformer(nn.Layer): m = self.layers[i] m.eval() for param in m.parameters(): - param.requires_grad = False + param.stop_gradient = True def _init_weights(self, m): if isinstance(m, nn.Linear): @@ -713,7 +664,7 @@ class SwinTransformer(nn.Layer): def forward(self, x): """Forward function.""" x = self.patch_embed(x['image']) - _, _, Wh, Ww = x.shape + B, _, Wh, Ww = x.shape if self.ape: # interpolate the position embedding to the corresponding size absolute_pos_embed = F.interpolate( diff --git a/paddlers/models/ppdet/modeling/backbones/transformer_utils.py b/paddlers/models/ppdet/modeling/backbones/transformer_utils.py new file mode 100644 index 0000000..bc10652 --- /dev/null +++ b/paddlers/models/ppdet/modeling/backbones/transformer_utils.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from paddle.nn.initializer import TruncatedNormal, Constant, Assign + +# Common initializations +ones_ = Constant(value=1.) +zeros_ = Constant(value=0.) 
+trunc_normal_ = TruncatedNormal(std=.02) + + +# Common Layers +def drop_path(x, drop_prob=0., training=False): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +# common funcs + + +def to_2tuple(x): + if isinstance(x, (list, tuple)): + return x + return tuple([x] * 2) + + +def add_parameter(layer, datas, name=None): + parameter = layer.create_parameter( + shape=(datas.shape), default_initializer=Assign(datas)) + if name: + layer.add_parameter(name, parameter) + return parameter diff --git a/paddlers/models/ppdet/modeling/backbones/vgg.py b/paddlers/models/ppdet/modeling/backbones/vgg.py old mode 100644 new mode 100755 index 1b9e19a..10ddd7b --- a/paddlers/models/ppdet/modeling/backbones/vgg.py +++ b/paddlers/models/ppdet/modeling/backbones/vgg.py @@ -168,9 +168,9 @@ class VGG(nn.Layer): self.norms = [] for i, n in enumerate(self.normalizations): if n != -1: - norm = self.add_sublayer( - "norm{}".format(i), - L2NormScale(self.extra_block_filters[i][1], n)) + norm = self.add_sublayer("norm{}".format(i), + L2NormScale( + self.extra_block_filters[i][1], n)) else: norm = None self.norms.append(norm) diff --git a/paddlers/models/ppdet/modeling/backbones/vision_transformer.py b/paddlers/models/ppdet/modeling/backbones/vision_transformer.py new file mode 100644 index 0000000..586b6f2 --- /dev/null +++ b/paddlers/models/ppdet/modeling/backbones/vision_transformer.py @@ -0,0 +1,634 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
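The `drop_path` helper moved into transformer_utils.py above implements stochastic depth: during training it zeroes out entire samples of a batch with probability `drop_prob` and rescales the survivors by `1 / keep_prob`, so the expected value of the activations is unchanged. A minimal sketch of that behavior (assuming a working PaddlePaddle install; the shapes and probability are illustrative only):

```python
import paddle

def drop_path(x, drop_prob=0., training=False):
    # Same logic as the helper in transformer_utils.py: draw one
    # keep/drop decision per sample and broadcast it over all other dims.
    if drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob)
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize: 0 or 1 per sample
    return x.divide(keep_prob) * random_tensor

x = paddle.ones([4, 3, 2])
y = drop_path(x, drop_prob=0.5, training=True)
# Each of the 4 samples is now either all 0.0 or all 2.0 (= 1 / keep_prob),
# so E[y] == x over many draws.
print(y[:, 0, 0].numpy())
```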
+ +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from paddle.nn.initializer import Constant + +from paddlers.models.ppdet.modeling.shape_spec import ShapeSpec +from paddlers.models.ppdet.core.workspace import register, serializable + +from .transformer_utils import zeros_, DropPath, Identity + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + window_size=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) + + if qkv_bias: + self.q_bias = self.create_parameter( + shape=([dim]), default_initializer=zeros_) + self.v_bias = self.create_parameter( + shape=([dim]), default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + shape=(self.num_relative_distance, num_heads), + default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2) + coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1) + relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone( + ) + + #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh + relative_coords = relative_coords.transpose( + (1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + # trunc_normal_(self.relative_position_bias_table, std=.0) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + x_shape = paddle.shape(x) + N, C = x_shape[1], 
x_shape[2] + + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat( + (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) + qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) + + qkv = qkv.reshape((-1, N, 3, self.num_heads, + C // self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + (2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + window_size=None, + init_values=None, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + if init_values is not None: + self.gamma_1 = self.create_parameter( + shape=([dim]), default_initializer=Constant(value=init_values)) + self.gamma_2 = self.create_parameter( + shape=([dim]), default_initializer=Constant(value=init_values)) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None): + + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=[224, 224], + patch_size=16, + in_chans=3, + embed_dim=768): + super().__init__() + self.num_patches_w = img_size[0] // patch_size + self.num_patches_h = img_size[1] // patch_size + + num_patches = self.num_patches_w * self.num_patches_h + self.patch_shape = (img_size[0] // patch_size, + img_size[1] // patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + @property + def num_patches_in_h(self): + return self.img_size[1] // self.patch_size + + @property + def num_patches_in_w(self): + return self.img_size[0] // self.patch_size + + def forward(self, x, mask=None): + B, C, H, W = x.shape + return self.proj(x) + + +class 
RelativePositionBias(nn.Layer):
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = self.create_parameter(
+            shape=(self.num_relative_distance, num_heads),
+            default_initializer=zeros_)
+        # cls to token & token to cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(window_size[0])
+        coords_w = paddle.arange(window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = coords.flatten(1)  # 2, Wh*Ww
+
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.transpose(
+            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        relative_position_index = \
+            paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(
+            -1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def forward(self):
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww,Wh*Ww,nH
+        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, token=False):
+    ''' Sinusoid position encoding table '''
+
+    def get_position_angle_vec(position):
+        return [
+            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
+            for hid_j in range(d_hid)
+        ]
+
+    sinusoid_table = np.array(
+        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+    if token:
+        sinusoid_table = np.concatenate(
+            [sinusoid_table, np.zeros([1, d_hid])], axis=0)
+
+    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
+
+
+@register
+@serializable
+class VisionTransformer(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=[672, 1092],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer='nn.LayerNorm',
+                 init_values=None,
+                 use_rel_pos_bias=False,
+                 use_shared_rel_pos_bias=False,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 out_indices=[3, 5, 7, 11],
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=True,
+                 with_fpn=True,
+                 use_checkpoint=False,
+                 **args):
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+        self.use_sincos_pos_emb = use_sincos_pos_emb
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.final_norm = final_norm
+
+        if use_checkpoint:
+            paddle.seed(0)
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            
in_chans=in_chans, + embed_dim=embed_dim) + + self.pos_w = self.patch_embed.num_patches_in_w + self.pos_h = self.patch_embed.num_patches_in_h + + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + + if use_abs_pos_emb: + self.pos_embed = self.create_parameter( + shape=(1, self.pos_w * self.pos_h + 1, embed_dim), + default_initializer=paddle.nn.initializer.TruncatedNormal( + std=.02)) + elif use_sincos_pos_emb: + pos_embed = self.build_2d_sincos_position_embedding(embed_dim) + + self.pos_embed = pos_embed + self.pos_embed = self.create_parameter(shape=pos_embed.shape) + self.pos_embed.set_value(pos_embed.numpy()) + self.pos_embed.stop_gradient = True + + else: + self.pos_embed = None + + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias else None, + epsilon=epsilon) for i in range(depth) + ]) + + self.pretrained = pretrained + self.init_weight() + + assert len(out_indices) <= 4, '' + self.out_indices = out_indices + self.out_channels = [embed_dim for _ in range(len(out_indices))] + self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ + 8 for _ in range(len(out_indices)) + ] + + self.norm = Identity() + + if self.with_fpn: + self.init_fpn( + embed_dim=embed_dim, + patch_size=patch_size, ) + + def init_weight(self): + pretrained = self.pretrained + + if pretrained: + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + + load_state_dict = paddle.load(path) + model_state_dict = self.state_dict() + pos_embed_name = "pos_embed" + + if pos_embed_name in load_state_dict.keys(): + load_pos_embed = paddle.to_tensor( + load_state_dict[pos_embed_name], dtype="float32") + if self.pos_embed.shape != load_pos_embed.shape: + pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) + model_state_dict[pos_embed_name] = self.resize_pos_embed( + load_pos_embed, (pos_size, pos_size), + (self.pos_h, self.pos_w)) + + # self.set_state_dict(model_state_dict) + load_state_dict[pos_embed_name] = model_state_dict[ + pos_embed_name] + + print("Load pos_embed and resize it from {} to {} .".format( + load_pos_embed.shape, self.pos_embed.shape)) + + self.set_state_dict(load_state_dict) + print("Load load_state_dict....") + + def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.BatchNorm2D(embed_dim), + nn.GELU(), + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn3 = Identity() + + self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = Identity() + + self.fpn3 = 
nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) + + self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) + + if not out_with_norm: + self.norm = Identity() + else: + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + + def interpolate_pos_encoding(self, x, w, h): + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + w0 = w // self.patch_embed.patch_size + h0 = h // self.patch_embed.patch_size + if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h: + return self.pos_embed + class_pos_embed = self.pos_embed[:, 0] + patch_pos_embed = self.pos_embed[:, 1:] + dim = x.shape[-1] + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape([ + 1, self.patch_embed.num_patches_w, + self.patch_embed.num_patches_h, dim + ]).transpose((0, 3, 1, 2)), + scale_factor=(w0 / self.patch_embed.num_patches_w, + h0 / self.patch_embed.num_patches_h), + mode='bicubic', ) + assert int(w0) == patch_pos_embed.shape[-2] and int( + h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.transpose( + (0, 2, 3, 1)).reshape([1, -1, dim]) + return paddle.concat( + (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1) + + def resize_pos_embed(self, pos_embed, old_hw, new_hw): + """ + Resize pos_embed weight. + Args: + pos_embed (Tensor): the pos_embed weight + old_hw (list[int]): the height and width of old pos_embed + new_hw (list[int]): the height and width of new pos_embed + Returns: + Tensor: the resized pos_embed weight + """ + cls_pos_embed = pos_embed[:, :1, :] + pos_embed = pos_embed[:, 1:, :] + + pos_embed = pos_embed.transpose([0, 2, 1]) + pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) + pos_embed = F.interpolate( + pos_embed, new_hw, mode='bicubic', align_corners=False) + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) + pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) + + return pos_embed + + def build_2d_sincos_position_embedding( + self, + embed_dim=768, + temperature=10000., ): + h, w = self.patch_embed.patch_shape + grid_w = paddle.arange(w, dtype=paddle.float32) + grid_h = paddle.arange(h, dtype=paddle.float32) + grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) + assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = 1. 
/ (temperature**omega) + + out_w = grid_w.flatten()[..., None] @omega[None] + out_h = grid_h.flatten()[..., None] @omega[None] + + pos_emb = paddle.concat( + [ + paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), + paddle.cos(out_h) + ], + axis=1)[None, :, :] + + pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32) + pos_embed = paddle.concat([pe_token, pos_emb], axis=1) + # pos_embed.stop_gradient = True + + return pos_embed + + def forward(self, x): + x = x['image'] if isinstance(x, dict) else x + _, _, h, w = x.shape + + x = self.patch_embed(x) + + B, D, Hp, Wp = x.shape # b * c * h * w + + cls_tokens = self.cls_token.expand( + (B, self.cls_token.shape[-2], self.cls_token.shape[-1])) + x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c + x = paddle.concat([cls_tokens, x], axis=1) + + if self.pos_embed is not None: + # x = x + self.interpolate_pos_encoding(x, w, h) + x = x + self.interpolate_pos_encoding(x, h, w) + + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias( + ) if self.rel_pos_bias is not None else None + + feats = [] + for idx, blk in enumerate(self.blocks): + if self.use_checkpoint and self.training: + x = paddle.distributed.fleet.utils.recompute( + blk, x, rel_pos_bias, **{"preserve_rng_state": True}) + else: + x = blk(x, rel_pos_bias) + + if idx in self.out_indices: + xp = paddle.reshape( + paddle.transpose( + self.norm(x[:, 1:, :]), perm=[0, 2, 1]), + shape=[B, D, Hp, Wp]) + feats.append(xp) + + if self.with_fpn: + fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] + for i in range(len(feats)): + feats[i] = fpns[i](feats[i]) + + return feats + + @property + def num_layers(self): + return len(self.blocks) + + @property + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self.out_channels, self.out_strides) + ] diff --git a/paddlers/models/ppdet/modeling/bbox_utils.py b/paddlers/models/ppdet/modeling/bbox_utils.py index d5d376c..a656c35 100644 --- a/paddlers/models/ppdet/modeling/bbox_utils.py +++ b/paddlers/models/ppdet/modeling/bbox_utils.py @@ -278,8 +278,8 @@ def decode_yolo(box, anchor, downsample_ratio): return [x1, y1, w1, h1] -def iou_similarity(box1, box2, eps=1e-9): - """Calculate iou of box1 and box2 +def batch_iou_similarity(box1, box2, eps=1e-9): + """Calculate iou of box1 and box2 in batch Args: box1 (Tensor): box with the shape [N, M1, 4] @@ -359,295 +359,6 @@ def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): return iou -def rect2rbox(bboxes): - """ - :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax) - :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle) - """ - bboxes = bboxes.reshape(-1, 4) - num_boxes = bboxes.shape[0] - - x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0 - y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0 - edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0]) - edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1]) - angles = np.zeros([num_boxes], dtype=bboxes.dtype) - - inds = edges1 < edges2 - - rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1) - rboxes[inds, 2] = edges2[inds] - rboxes[inds, 3] = edges1[inds] - rboxes[inds, 4] = np.pi / 2.0 - return rboxes - - -def delta2rbox(rrois, - deltas, - means=[0, 0, 0, 0, 0], - stds=[1, 1, 1, 1, 1], - wh_ratio_clip=1e-6): - """ - :param rrois: (cx, cy, w, h, theta) - :param deltas: (dx, dy, dw, dh, dtheta) - :param means: - :param stds: - :param wh_ratio_clip: - :return: - """ - means = paddle.to_tensor(means) - stds = 
paddle.to_tensor(stds) - deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]]) - denorm_deltas = deltas * stds + means - - dx = denorm_deltas[:, 0] - dy = denorm_deltas[:, 1] - dw = denorm_deltas[:, 2] - dh = denorm_deltas[:, 3] - dangle = denorm_deltas[:, 4] - - max_ratio = np.abs(np.log(wh_ratio_clip)) - dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) - dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) - - rroi_x = rrois[:, 0] - rroi_y = rrois[:, 1] - rroi_w = rrois[:, 2] - rroi_h = rrois[:, 3] - rroi_angle = rrois[:, 4] - - gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin( - rroi_angle) + rroi_x - gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos( - rroi_angle) + rroi_y - gw = rroi_w * dw.exp() - gh = rroi_h * dh.exp() - ga = np.pi * dangle + rroi_angle - ga = (ga + np.pi / 4) % np.pi - np.pi / 4 - ga = paddle.to_tensor(ga) - - gw = paddle.to_tensor(gw, dtype='float32') - gh = paddle.to_tensor(gh, dtype='float32') - bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1) - return bboxes - - -def rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]): - """ - - Args: - proposals: - gt: - means: 1x5 - stds: 1x5 - - Returns: - - """ - proposals = proposals.astype(np.float64) - - PI = np.pi - - gt_widths = gt[..., 2] - gt_heights = gt[..., 3] - gt_angle = gt[..., 4] - - proposals_widths = proposals[..., 2] - proposals_heights = proposals[..., 3] - proposals_angle = proposals[..., 4] - - coord = gt[..., 0:2] - proposals[..., 0:2] - dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4]) - * coord[..., 1]) / proposals_widths - dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4]) - * coord[..., 1]) / proposals_heights - dw = np.log(gt_widths / proposals_widths) - dh = np.log(gt_heights / proposals_heights) - da = (gt_angle - proposals_angle) - - da = (da + PI / 4) % PI - PI / 4 - da /= PI - - deltas = np.stack([dx, dy, dw, dh, da], axis=-1) - means = np.array(means, dtype=deltas.dtype) - stds = np.array(stds, dtype=deltas.dtype) - deltas = (deltas - means) / stds - deltas = deltas.astype(np.float32) - return deltas - - -def bbox_decode(bbox_preds, - anchors, - means=[0, 0, 0, 0, 0], - stds=[1, 1, 1, 1, 1]): - """decode bbox from deltas - Args: - bbox_preds: [N,H,W,5] - anchors: [H*W,5] - return: - bboxes: [N,H,W,5] - """ - means = paddle.to_tensor(means) - stds = paddle.to_tensor(stds) - num_imgs, H, W, _ = bbox_preds.shape - bboxes_list = [] - for img_id in range(num_imgs): - bbox_pred = bbox_preds[img_id] - # bbox_pred.shape=[5,H,W] - bbox_delta = bbox_pred - anchors = paddle.to_tensor(anchors) - bboxes = delta2rbox( - anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6) - bboxes = paddle.reshape(bboxes, [H, W, 5]) - bboxes_list.append(bboxes) - return paddle.stack(bboxes_list, axis=0) - - -def poly2rbox(polys): - """ - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - to - rotated_boxes:[x_ctr,y_ctr,w,h,angle] - """ - rotated_boxes = [] - for poly in polys: - poly = np.array(poly[:8], dtype=np.float32) - - pt1 = (poly[0], poly[1]) - pt2 = (poly[2], poly[3]) - pt3 = (poly[4], poly[5]) - pt4 = (poly[6], poly[7]) - - edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[ - 1]) * (pt1[1] - pt2[1])) - edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[ - 1]) * (pt2[1] - pt3[1])) - - width = max(edge1, edge2) - height = min(edge1, edge2) - - rbox_angle = 0 - if edge1 > edge2: - rbox_angle = np.arctan2( - float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0])) - elif edge2 >= edge1: - 
rbox_angle = np.arctan2( - float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0])) - - def norm_angle(angle, range=[-np.pi / 4, np.pi]): - return (angle - range[0]) % range[1] + range[0] - - rbox_angle = norm_angle(rbox_angle) - - x_ctr = float(pt1[0] + pt3[0]) / 2 - y_ctr = float(pt1[1] + pt3[1]) / 2 - rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle]) - rotated_boxes.append(rotated_box) - ret_rotated_boxes = np.array(rotated_boxes) - assert ret_rotated_boxes.shape[1] == 5 - return ret_rotated_boxes - - -def cal_line_length(point1, point2): - import math - return math.sqrt( - math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) - - -def get_best_begin_point_single(coordinate): - x1, y1, x2, y2, x3, y3, x4, y4 = coordinate - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - xmax = max(x1, x2, x3, x4) - ymax = max(y1, y2, y3, y4) - combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], - [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], - [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], - [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] - dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] - force = 100000000.0 - force_flag = 0 - for i in range(4): - temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ - + cal_line_length(combinate[i][1], dst_coordinate[1]) \ - + cal_line_length(combinate[i][2], dst_coordinate[2]) \ - + cal_line_length(combinate[i][3], dst_coordinate[3]) - if temp_force < force: - force = temp_force - force_flag = i - if force_flag != 0: - pass - return np.array(combinate[force_flag]).reshape(8) - - -def rbox2poly_np(rrects): - """ - rrect:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - polys = [] - for i in range(rrects.shape[0]): - rrect = rrects[i] - # x_ctr, y_ctr, width, height, angle = rrect[:5] - x_ctr = rrect[0] - y_ctr = rrect[1] - width = rrect[2] - height = rrect[3] - angle = rrect[4] - tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 - rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) - R = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - poly = R.dot(rect) - x0, x1, x2, x3 = poly[0, :4] + x_ctr - y0, y1, y2, y3 = poly[1, :4] + y_ctr - poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) - poly = get_best_begin_point_single(poly) - polys.append(poly) - polys = np.array(polys) - return polys - - -def rbox2poly(rrects): - """ - rrect:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - N = paddle.shape(rrects)[0] - - x_ctr = rrects[:, 0] - y_ctr = rrects[:, 1] - width = rrects[:, 2] - height = rrects[:, 3] - angle = rrects[:, 4] - - tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5 - - normal_rects = paddle.stack( - [tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0) - normal_rects = paddle.reshape(normal_rects, [2, 4, N]) - normal_rects = paddle.transpose(normal_rects, [2, 0, 1]) - - sin, cos = paddle.sin(angle), paddle.cos(angle) - # M.shape=[N,2,2] - M = paddle.stack([cos, -sin, sin, cos], axis=0) - M = paddle.reshape(M, [2, 2, N]) - M = paddle.transpose(M, [2, 0, 1]) - - # polys:[N,8] - polys = paddle.matmul(M, normal_rects) - polys = paddle.transpose(polys, [2, 1, 0]) - polys = paddle.reshape(polys, [-1, N]) - polys = paddle.transpose(polys, [1, 0]) - - tmp = paddle.stack( - [x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1) - polys = polys + tmp - return polys - - def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16): """ 
Calculate the iou of box1 and box2 with numpy.
@@ -744,9 +455,9 @@ def distance2bbox(points, distance, max_shape=None):
 def bbox_center(boxes):
     """Get bbox centers from boxes.
     Args:
-        boxes (Tensor): boxes with shape (N, 4), "xmin, ymin, xmax, ymax" format.
+        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
     Returns:
-        Tensor: boxes centers with shape (N, 2), "cx, cy" format.
+        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
     """
     boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2
     boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2
@@ -756,20 +467,136 @@ def bbox_center(boxes):
 def batch_distance2bbox(points, distance, max_shapes=None):
     """Decode distance prediction to bounding box for batch.
     Args:
-        points (Tensor): [B, ..., 2]
-        distance (Tensor): [B, ..., 4]
-        max_shapes (tuple): [B, 2], "h,w" format, Shape of the image.
+        points (Tensor): [B, ..., 2], "xy" format
+        distance (Tensor): [B, ..., 4], "ltrb" format
+        max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.
     Returns:
-        Tensor: Decoded bboxes.
+        Tensor: Decoded bboxes, "x1y1x2y2" format.
     """
-    x1 = points[..., 0] - distance[..., 0]
-    y1 = points[..., 1] - distance[..., 1]
-    x2 = points[..., 0] + distance[..., 2]
-    y2 = points[..., 1] + distance[..., 3]
+    lt, rb = paddle.split(distance, 2, -1)
+    # When adding a tensor and a parameter, the parameter is better
+    # placed as the second operand.
+    x1y1 = -lt + points
+    x2y2 = rb + points
+    out_bbox = paddle.concat([x1y1, x2y2], -1)
     if max_shapes is not None:
-        for i, max_shape in enumerate(max_shapes):
-            x1[i] = x1[i].clip(min=0, max=max_shape[1])
-            y1[i] = y1[i].clip(min=0, max=max_shape[0])
-            x2[i] = x2[i].clip(min=0, max=max_shape[1])
-            y2[i] = y2[i].clip(min=0, max=max_shape[0])
-    return paddle.stack([x1, y1, x2, y2], -1)
+        max_shapes = max_shapes.flip(-1).tile([1, 2])
+        delta_dim = out_bbox.ndim - max_shapes.ndim
+        for _ in range(delta_dim):
+            max_shapes.unsqueeze_(1)
+        out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)
+        out_bbox = paddle.where(out_bbox > 0, out_bbox,
+                                paddle.zeros_like(out_bbox))
+    return out_bbox
+
+
+def delta2bbox_v2(rois,
+                  deltas,
+                  means=(0.0, 0.0, 0.0, 0.0),
+                  stds=(1.0, 1.0, 1.0, 1.0),
+                  max_shape=None,
+                  wh_ratio_clip=16.0 / 1000.0,
+                  ctr_clip=None):
+    """Transform network output (delta) to bboxes.
+ Based on https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/ + bbox/coder/delta_xywh_bbox_coder.py + Args: + rois (Tensor): shape [..., 4], base bboxes, typical examples include + anchor and rois + deltas (Tensor): shape [..., 4], offset relative to base bboxes + means (list[float]): the mean that was used to normalize deltas, + must be of size 4 + stds (list[float]): the std that was used to normalize deltas, + must be of size 4 + max_shape (list[float] or None): height and width of image, will be + used to clip bboxes if not None + wh_ratio_clip (float): to clip delta wh of decoded bboxes + ctr_clip (float or None): whether to clip delta xy of decoded bboxes + """ + if rois.size == 0: + return paddle.empty_like(rois) + means = paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + deltas = deltas * stds + means + + dxy = deltas[..., :2] + dwh = deltas[..., 2:] + + pxy = (rois[..., :2] + rois[..., 2:]) * 0.5 + pwh = rois[..., 2:] - rois[..., :2] + dxy_wh = pwh * dxy + + max_ratio = np.abs(np.log(wh_ratio_clip)) + if ctr_clip is not None: + dxy_wh = paddle.clip(dxy_wh, max=ctr_clip, min=-ctr_clip) + dwh = paddle.clip(dwh, max=max_ratio) + else: + dwh = dwh.clip(min=-max_ratio, max=max_ratio) + + gxy = pxy + dxy_wh + gwh = pwh * dwh.exp() + x1y1 = gxy - (gwh * 0.5) + x2y2 = gxy + (gwh * 0.5) + bboxes = paddle.concat([x1y1, x2y2], axis=-1) + if max_shape is not None: + bboxes[..., 0::2] = bboxes[..., 0::2].clip(min=0, max=max_shape[1]) + bboxes[..., 1::2] = bboxes[..., 1::2].clip(min=0, max=max_shape[0]) + return bboxes + + +def bbox2delta_v2(src_boxes, + tgt_boxes, + means=(0.0, 0.0, 0.0, 0.0), + stds=(1.0, 1.0, 1.0, 1.0)): + """Encode bboxes to deltas. + Modified from paddlers.models.ppdet.modeling.bbox_utils.bbox2delta. + Args: + src_boxes (Tensor[..., 4]): base bboxes + tgt_boxes (Tensor[..., 4]): target bboxes + means (list[float]): the mean that will be used to normalize delta + stds (list[float]): the std that will be used to normalize delta + """ + if src_boxes.size == 0: + return paddle.empty_like(src_boxes) + src_w = src_boxes[..., 2] - src_boxes[..., 0] + src_h = src_boxes[..., 3] - src_boxes[..., 1] + src_ctr_x = src_boxes[..., 0] + 0.5 * src_w + src_ctr_y = src_boxes[..., 1] + 0.5 * src_h + + tgt_w = tgt_boxes[..., 2] - tgt_boxes[..., 0] + tgt_h = tgt_boxes[..., 3] - tgt_boxes[..., 1] + tgt_ctr_x = tgt_boxes[..., 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[..., 1] + 0.5 * tgt_h + + dx = (tgt_ctr_x - src_ctr_x) / src_w + dy = (tgt_ctr_y - src_ctr_y) / src_h + dw = paddle.log(tgt_w / src_w) + dh = paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) # [n, 4] + means = paddle.to_tensor(means, place=src_boxes.place) + stds = paddle.to_tensor(stds, place=src_boxes.place) + deltas = (deltas - means) / stds + return deltas + + +def iou_similarity(box1, box2, eps=1e-10): + """Calculate iou of box1 and box2 + + Args: + box1 (Tensor): box with the shape [M1, 4] + box2 (Tensor): box with the shape [M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [M1, M2] + """ + box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4] + box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4] + px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4] + gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / 
union diff --git a/paddlers/models/ppdet/modeling/cls_utils.py b/paddlers/models/ppdet/modeling/cls_utils.py new file mode 100644 index 0000000..3ae8d11 --- /dev/null +++ b/paddlers/models/ppdet/modeling/cls_utils.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _get_class_default_kwargs(cls, *args, **kwargs): + """ + Get default arguments of a class in dict format, if args and + kwargs is specified, it will replace default arguments + """ + varnames = cls.__init__.__code__.co_varnames + argcount = cls.__init__.__code__.co_argcount + keys = varnames[:argcount] + assert keys[0] == 'self' + keys = keys[1:] + + values = list(cls.__init__.__defaults__) + assert len(values) == len(keys) + + if len(args) > 0: + for i, arg in enumerate(args): + values[i] = arg + + default_kwargs = dict(zip(keys, values)) + + if len(kwargs) > 0: + for k, v in kwargs.items(): + default_kwargs[k] = v + + return default_kwargs diff --git a/paddlers/models/ppdet/modeling/heads/__init__.py b/paddlers/models/ppdet/modeling/heads/__init__.py index 46caf7a..ec2e227 100644 --- a/paddlers/models/ppdet/modeling/heads/__init__.py +++ b/paddlers/models/ppdet/modeling/heads/__init__.py @@ -31,6 +31,8 @@ from . import pico_head from . import detr_head from . import sparsercnn_head from . import tood_head +from . import retina_head +from . import ppyoloe_head from .bbox_head import * from .mask_head import * @@ -51,3 +53,5 @@ from .pico_head import * from .detr_head import * from .sparsercnn_head import * from .tood_head import * +from .retina_head import * +from .ppyoloe_head import * diff --git a/paddlers/models/ppdet/modeling/heads/bbox_head.py b/paddlers/models/ppdet/modeling/heads/bbox_head.py index 2654dc9..fbb7f05 100644 --- a/paddlers/models/ppdet/modeling/heads/bbox_head.py +++ b/paddlers/models/ppdet/modeling/heads/bbox_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and # limitations under the License. import numpy as np @@ -24,6 +24,7 @@ from paddlers.models.ppdet.core.workspace import register, create from .roi_extractor import RoIAlign from ..shape_spec import ShapeSpec from ..bbox_utils import bbox2delta +from ..cls_utils import _get_class_default_kwargs from paddlers.models.ppdet.modeling.layers import ConvNormLayer __all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead'] @@ -89,7 +90,7 @@ class XConvNormHead(nn.Layer): conv_dim (int): The number of channels for the conv layers out_channel (int): Output channels resolution (int): Resolution of input feature map - norm_type (string): Norm type, bn, gn, sync_bn are available, + norm_type (string): Norm type, bn, gn, sync_bn are available, default `gn` freeze_norm (bool): Whether to freeze the norm stage_name (string): Prefix name for conv layer, '' by default @@ -168,22 +169,23 @@ class BBoxHead(nn.Layer): head (nn.Layer): Extract feature in bbox head in_channel (int): Input channel after RoI extractor roi_extractor (object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the + bbox_assigner (object): The module of Box Assigner, label and sample the box. with_pool (bool): Whether to use pooling for the RoI feature. num_classes (int): The number of classes - bbox_weight (List[float]): The weight to get the decode box + bbox_weight (List[float]): The weight to get the decode box """ def __init__(self, head, in_channel, - roi_extractor=RoIAlign().__dict__, + roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_assigner='BboxAssigner', with_pool=False, num_classes=80, bbox_weight=[10., 10., 5., 5.], - bbox_loss=None): + bbox_loss=None, + loss_normalize_pos=False): super(BBoxHead, self).__init__() self.head = head self.roi_extractor = roi_extractor @@ -195,6 +197,7 @@ class BBoxHead(nn.Layer): self.num_classes = num_classes self.bbox_weight = bbox_weight self.bbox_loss = bbox_loss + self.loss_normalize_pos = loss_normalize_pos self.bbox_score = nn.Linear( in_channel, @@ -249,14 +252,25 @@ class BBoxHead(nn.Layer): deltas = self.bbox_delta(feat) if self.training: - loss = self.get_loss(scores, deltas, targets, rois, - self.bbox_weight) + loss = self.get_loss( + scores, + deltas, + targets, + rois, + self.bbox_weight, + loss_normalize_pos=self.loss_normalize_pos) return loss, bbox_feat else: pred = self.get_prediction(scores, deltas) return pred, self.head - def get_loss(self, scores, deltas, targets, rois, bbox_weight): + def get_loss(self, + scores, + deltas, + targets, + rois, + bbox_weight, + loss_normalize_pos=False): """ scores (Tensor): scores from bbox head outputs deltas (Tensor): deltas from bbox head outputs @@ -279,8 +293,15 @@ class BBoxHead(nn.Layer): else: tgt_labels = tgt_labels.cast('int64') tgt_labels.stop_gradient = True - loss_bbox_cls = F.cross_entropy( - input=scores, label=tgt_labels, reduction='mean') + + if not loss_normalize_pos: + loss_bbox_cls = F.cross_entropy( + input=scores, label=tgt_labels, reduction='mean') + else: + loss_bbox_cls = F.cross_entropy( + input=scores, label=tgt_labels, + reduction='none').sum() / (tgt_labels.shape[0] + 1e-7) + loss_bbox[cls_name] = loss_bbox_cls # bbox reg @@ -321,9 +342,16 @@ class BBoxHead(nn.Layer): if self.bbox_loss is not None: reg_delta = self.bbox_transform(reg_delta) reg_target = self.bbox_transform(reg_target) - loss_bbox_reg = self.bbox_loss( - reg_delta, reg_target).sum() / tgt_labels.shape[0] - loss_bbox_reg *= 
self.num_classes + + if not loss_normalize_pos: + loss_bbox_reg = self.bbox_loss( + reg_delta, reg_target).sum() / tgt_labels.shape[0] + loss_bbox_reg *= self.num_classes + + else: + loss_bbox_reg = self.bbox_loss( + reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7) + else: loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum( ) / tgt_labels.shape[0] diff --git a/paddlers/models/ppdet/modeling/heads/cascade_head.py b/paddlers/models/ppdet/modeling/heads/cascade_head.py index a9ad574..d2acf4a 100644 --- a/paddlers/models/ppdet/modeling/heads/cascade_head.py +++ b/paddlers/models/ppdet/modeling/heads/cascade_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import paddle @@ -22,6 +22,7 @@ from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead from .roi_extractor import RoIAlign from ..shape_spec import ShapeSpec from ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox +from ..cls_utils import _get_class_default_kwargs __all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead'] @@ -82,7 +83,7 @@ class CascadeXConvNormHead(nn.Layer): conv_dim (int): The number of channels for the conv layers out_channel (int): Output channels resolution (int): Resolution of input feature map - norm_type (string): Norm type, bn, gn, sync_bn are available, + norm_type (string): Norm type, bn, gn, sync_bn are available, default `gn` freeze_norm (bool): Whether to freeze the norm num_cascade_stage (int): The number of cascade stage, default 3 @@ -142,10 +143,10 @@ class CascadeHead(BBoxHead): head (nn.Layer): Extract feature in bbox head in_channel (int): Input channel after RoI extractor roi_extractor (object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the + bbox_assigner (object): The module of Box Assigner, label and sample the box. 
num_classes (int): The number of classes - bbox_weight (List[List[float]]): The weight to get the decode box and the + bbox_weight (List[List[float]]): The weight to get the decode box and the length of weight is the number of cascade stage num_cascade_stages (int): THe number of stage to refine the box """ @@ -153,13 +154,18 @@ class CascadeHead(BBoxHead): def __init__(self, head, in_channel, - roi_extractor=RoIAlign().__dict__, + roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_assigner='BboxAssigner', num_classes=80, bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0], [30.0, 30.0, 15.0, 15.0]], num_cascade_stages=3, - bbox_loss=None): + bbox_loss=None, + reg_class_agnostic=True, + stage_loss_weights=None, + loss_normalize_pos=False, + add_gt_as_proposals=[True, False, False]): + nn.Layer.__init__(self, ) self.head = head self.roi_extractor = roi_extractor @@ -171,6 +177,18 @@ class CascadeHead(BBoxHead): self.bbox_weight = bbox_weight self.num_cascade_stages = num_cascade_stages self.bbox_loss = bbox_loss + self.stage_loss_weights = [ + 1. / num_cascade_stages for _ in range(num_cascade_stages) + ] if stage_loss_weights is None else stage_loss_weights + self.add_gt_as_proposals = add_gt_as_proposals + + assert len( + self.stage_loss_weights + ) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) do not equal to num_cascade_stages({num_cascade_stages})' + + self.reg_class_agnostic = reg_class_agnostic + num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes + self.loss_normalize_pos = loss_normalize_pos self.bbox_score_list = [] self.bbox_delta_list = [] @@ -189,7 +207,7 @@ class CascadeHead(BBoxHead): delta_name, nn.Linear( in_channel, - 4, + num_bbox_delta, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.001)))) self.bbox_score_list.append(bbox_score) @@ -206,7 +224,11 @@ class CascadeHead(BBoxHead): """ targets = [] if self.training: - rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs) + rois, rois_num, targets = self.bbox_assigner( + rois, + rois_num, + inputs, + add_gt_as_proposals=self.add_gt_as_proposals[0]) targets_list = [targets] self.assigned_rois = (rois, rois_num) self.assigned_targets = targets @@ -219,13 +241,32 @@ class CascadeHead(BBoxHead): inputs['im_shape']) if self.training: rois, rois_num, targets = self.bbox_assigner( - rois, rois_num, inputs, i, is_cascade=True) + rois, + rois_num, + inputs, + i, + is_cascade=True, + add_gt_as_proposals=self.add_gt_as_proposals[i]) targets_list.append(targets) rois_feat = self.roi_extractor(body_feats, rois, rois_num) bbox_feat = self.head(rois_feat, i) scores = self.bbox_score_list[i](bbox_feat) deltas = self.bbox_delta_list[i](bbox_feat) + + # TODO (lyuwenyu) Is it correct for only one class ? 
+            if not self.reg_class_agnostic and i < self.num_cascade_stages - 1:
+                deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4])
+                labels = scores[:, :-1].argmax(axis=-1)
+
+                if self.training:
+                    deltas = deltas[paddle.arange(deltas.shape[0]), labels]
+                else:
+                    deltas = deltas[((deltas + 10000) * F.one_hot(
+                        labels, num_classes=self.num_classes).unsqueeze(-1) != 0
+                                     ).nonzero(as_tuple=True)].reshape(
+                                         [deltas.shape[0], 4])
+
             head_out_list.append([scores, deltas, rois])
             pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i])
@@ -233,11 +274,16 @@ class CascadeHead(BBoxHead):
             loss = {}
             for stage, value in enumerate(zip(head_out_list, targets_list)):
                 (scores, deltas, rois), targets = value
-                loss_stage = self.get_loss(scores, deltas, targets, rois,
-                                           self.bbox_weight[stage])
+                loss_stage = self.get_loss(
+                    scores,
+                    deltas,
+                    targets,
+                    rois,
+                    self.bbox_weight[stage],
+                    loss_normalize_pos=self.loss_normalize_pos)
                 for k, v in loss_stage.items():
                     loss[k + "_stage{}".format(
-                        stage)] = v / self.num_cascade_stages
+                        stage)] = v * self.stage_loss_weights[stage]
             return loss, bbox_feat
         else:
@@ -266,6 +312,14 @@ class CascadeHead(BBoxHead):
         num_prop = []
         for p in proposals:
             num_prop.append(p.shape[0])
+
+        # NOTE(dev): num_prop will be tagged as LoDTensorArray because it
+        # depends on batch_size under @to_static. However, the argument
+        # num_or_sections in paddle.split does not support LoDTensorArray,
+        # so we use [-1] to replace it when num_prop is not a list. This
+        # ensures the correctness of both dynamic and static graphs.
+        if not isinstance(num_prop, list):
+            num_prop = [-1]
         return pred_bbox.split(num_prop)

     def get_prediction(self, head_out_list):
diff --git a/paddlers/models/ppdet/modeling/heads/centernet_head.py b/paddlers/models/ppdet/modeling/heads/centernet_head.py
old mode 100644
new mode 100755
diff --git a/paddlers/models/ppdet/modeling/heads/face_head.py b/paddlers/models/ppdet/modeling/heads/face_head.py
index 02dc13b..fa4b96c 100644
--- a/paddlers/models/ppdet/modeling/heads/face_head.py
+++ b/paddlers/models/ppdet/modeling/heads/face_head.py
@@ -1,15 +1,15 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# 
-# Licensed under the Apache License, Version 2.0 (the "License"); 
-# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at 
-# 
-# http://www.apache.org/licenses/LICENSE-2.0 
-# 
-# Unless required by applicable law or agreed to in writing, software 
-# distributed under the License is distributed on an "AS IS" BASIS, 
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and 
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
 # limitations under the License.
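The `_get_class_default_kwargs` helper added in cls_utils.py earlier in this patch replaces the `RoIAlign().__dict__` / `AnchorGeneratorSSD().__dict__` pattern for default arguments: it reads the defaults straight from the class `__init__` signature, so no layer has to be instantiated just to inspect its attributes. A condensed sketch of what it computes (the `Demo` class is invented for this illustration):

```python
# Hypothetical stand-in class; only its __init__ defaults matter here.
class Demo:
    def __init__(self, resolution=14, sampling_ratio=0, aligned=True):
        pass

def get_class_default_kwargs(cls, *args, **kwargs):
    # Condensed version of the helper in cls_utils.py.
    varnames = cls.__init__.__code__.co_varnames
    argcount = cls.__init__.__code__.co_argcount
    keys = list(varnames[1:argcount])         # parameter names, minus 'self'
    values = list(cls.__init__.__defaults__)  # their default values
    for i, arg in enumerate(args):            # positional overrides, if any
        values[i] = arg
    default_kwargs = dict(zip(keys, values))
    default_kwargs.update(kwargs)             # keyword overrides, if any
    return default_kwargs

print(get_class_default_kwargs(Demo))
# {'resolution': 14, 'sampling_ratio': 0, 'aligned': True}
print(get_class_default_kwargs(Demo, 7, aligned=False))
# {'resolution': 7, 'sampling_ratio': 0, 'aligned': False}
```

Note that the helper assumes every `__init__` parameter after `self` has a default value, which holds for the classes it is applied to in this patch.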
import paddle @@ -17,6 +17,7 @@ import paddle.nn as nn from paddlers.models.ppdet.core.workspace import register from ..layers import AnchorGeneratorSSD +from ..cls_utils import _get_class_default_kwargs @register @@ -39,7 +40,7 @@ class FaceHead(nn.Layer): def __init__(self, num_classes=80, in_channels=[96, 96], - anchor_generator=AnchorGeneratorSSD().__dict__, + anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD), kernel_size=3, padding=1, conv_decay=0., diff --git a/paddlers/models/ppdet/modeling/heads/fcos_head.py b/paddlers/models/ppdet/modeling/heads/fcos_head.py index 079d751..568053b 100644 --- a/paddlers/models/ppdet/modeling/heads/fcos_head.py +++ b/paddlers/models/ppdet/modeling/heads/fcos_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import @@ -64,6 +64,8 @@ class FCOSFeat(nn.Layer): norm_type='bn', use_dcn=False): super(FCOSFeat, self).__init__() + self.feat_in = feat_in + self.feat_out = feat_out self.num_convs = num_convs self.norm_type = norm_type self.cls_subnet_convs = [] diff --git a/paddlers/models/ppdet/modeling/heads/gfl_head.py b/paddlers/models/ppdet/modeling/heads/gfl_head.py index 15bbddd..5331b3b 100644 --- a/paddlers/models/ppdet/modeling/heads/gfl_head.py +++ b/paddlers/models/ppdet/modeling/heads/gfl_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. # The code is based on: @@ -29,7 +29,7 @@ from paddle.nn.initializer import Normal, Constant from paddlers.models.ppdet.core.workspace import register from paddlers.models.ppdet.modeling.layers import ConvNormLayer -from paddlers.models.ppdet.modeling.bbox_utils import distance2bbox, bbox2distance +from paddlers.models.ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox from paddlers.models.ppdet.data.transform.atss_assigner import bbox_overlaps @@ -79,7 +79,9 @@ class Integral(nn.Layer): offsets from the box center in four directions, shape (N, 4). """ x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1) - x = F.linear(x, self.project).reshape([-1, 4]) + x = F.linear(x, self.project) + if self.training: + x = x.reshape([-1, 4]) return x @@ -241,18 +243,34 @@ class GFLHead(nn.Layer): ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] - for scale_reg, fpn_feat in zip(self.scales_regs, fpn_feats): + for stride, scale_reg, fpn_feat in zip(self.fpn_stride, + self.scales_regs, fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) - cls_logits = self.gfl_head_cls(conv_cls_feat) - bbox_reg = scale_reg(self.gfl_head_reg(conv_reg_feat)) + cls_score = self.gfl_head_cls(conv_cls_feat) + bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) if self.dgqp_module: - quality_score = self.dgqp_module(bbox_reg) - cls_logits = F.sigmoid(cls_logits) * quality_score + quality_score = self.dgqp_module(bbox_pred) + cls_score = F.sigmoid(cls_score) * quality_score if not self.training: - cls_logits = F.sigmoid(cls_logits.transpose([0, 2, 3, 1])) - bbox_reg = bbox_reg.transpose([0, 2, 3, 1]) - cls_logits_list.append(cls_logits) - bboxes_reg_list.append(bbox_reg) + cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) + bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) + b, cell_h, cell_w, _ = paddle.shape(cls_score) + y, x = self.get_single_level_center_point( + [cell_h, cell_w], stride, cell_offset=self.cell_offset) + center_points = paddle.stack([x, y], axis=-1) + cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) + bbox_pred = self.distribution_project(bbox_pred) * stride + bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4]) + + # NOTE: If keep_ratio=False and image shape value that + # multiples of 32, distance2bbox not set max_shapes parameter + # to speed up model prediction. If need to set max_shapes, + # please use inputs['im_shape']. 
+ bbox_pred = batch_distance2bbox( + center_points, bbox_pred, max_shapes=None) + + cls_logits_list.append(cls_score) + bboxes_reg_list.append(bbox_pred) return (cls_logits_list, bboxes_reg_list) @@ -370,7 +388,7 @@ class GFLHead(nn.Layer): avg_factor = sum(avg_factor) try: - avg_factor = paddle.distributed.all_reduce(avg_factor.clone()) + paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: @@ -410,71 +428,13 @@ class GFLHead(nn.Layer): x = x.flatten() return y, x - def get_bboxes_single(self, - cls_scores, - bbox_preds, - img_shape, - scale_factor, - rescale=True, - cell_offset=0): - assert len(cls_scores) == len(bbox_preds) - mlvl_bboxes = [] - mlvl_scores = [] - for stride, cls_score, bbox_pred in zip(self.fpn_stride, cls_scores, - bbox_preds): - featmap_size = [ - paddle.shape(cls_score)[0], paddle.shape(cls_score)[1] - ] - y, x = self.get_single_level_center_point( - featmap_size, stride, cell_offset=cell_offset) - center_points = paddle.stack([x, y], axis=-1) - scores = cls_score.reshape([-1, self.cls_out_channels]) - bbox_pred = self.distribution_project(bbox_pred) * stride - - if scores.shape[0] > self.nms_pre: - max_scores = scores.max(axis=1) - _, topk_inds = max_scores.topk(self.nms_pre) - center_points = center_points.gather(topk_inds) - bbox_pred = bbox_pred.gather(topk_inds) - scores = scores.gather(topk_inds) - - bboxes = distance2bbox( - center_points, bbox_pred, max_shape=img_shape) - mlvl_bboxes.append(bboxes) - mlvl_scores.append(scores) - mlvl_bboxes = paddle.concat(mlvl_bboxes) - if rescale: - # [h_scale, w_scale] to [w_scale, h_scale, w_scale, h_scale] - im_scale = paddle.concat([scale_factor[::-1], scale_factor[::-1]]) - mlvl_bboxes /= im_scale - mlvl_scores = paddle.concat(mlvl_scores) - mlvl_scores = mlvl_scores.transpose([1, 0]) - return mlvl_bboxes, mlvl_scores - - def decode(self, cls_scores, bbox_preds, im_shape, scale_factor, - cell_offset): - batch_bboxes = [] - batch_scores = [] - for img_id in range(cls_scores[0].shape[0]): - num_levels = len(cls_scores) - cls_score_list = [cls_scores[i][img_id] for i in range(num_levels)] - bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_levels)] - bboxes, scores = self.get_bboxes_single( - cls_score_list, - bbox_pred_list, - im_shape[img_id], - scale_factor[img_id], - cell_offset=cell_offset) - batch_bboxes.append(bboxes) - batch_scores.append(scores) - batch_bboxes = paddle.stack(batch_bboxes, axis=0) - batch_scores = paddle.stack(batch_scores, axis=0) - - return batch_bboxes, batch_scores - def post_process(self, gfl_head_outs, im_shape, scale_factor): cls_scores, bboxes_reg = gfl_head_outs - bboxes, score = self.decode(cls_scores, bboxes_reg, im_shape, - scale_factor, self.cell_offset) - bbox_pred, bbox_num, _ = self.nms(bboxes, score) + bboxes = paddle.concat(bboxes_reg, axis=1) + # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] + im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) + bboxes /= im_scale + mlvl_scores = paddle.concat(cls_scores, axis=1) + mlvl_scores = mlvl_scores.transpose([0, 2, 1]) + bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores) return bbox_pred, bbox_num diff --git a/paddlers/models/ppdet/modeling/heads/keypoint_hrhrnet_head.py b/paddlers/models/ppdet/modeling/heads/keypoint_hrhrnet_head.py index 4c47474..fc9e2f5 100644 --- a/paddlers/models/ppdet/modeling/heads/keypoint_hrhrnet_head.py +++ b/paddlers/models/ppdet/modeling/heads/keypoint_hrhrnet_head.py @@ -1,15 +1,15 @@ 
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import paddle diff --git a/paddlers/models/ppdet/modeling/heads/mask_head.py b/paddlers/models/ppdet/modeling/heads/mask_head.py index 01fbc39..63f8d91 100644 --- a/paddlers/models/ppdet/modeling/heads/mask_head.py +++ b/paddlers/models/ppdet/modeling/heads/mask_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
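Note on a pattern that recurs throughout this patch: default arguments of the form `SomeClass().__dict__` (e.g. `RoIAlign().__dict__` in the hunk below, `AnchorGeneratorSSD().__dict__`, `RBoxAssigner().__dict__`) are replaced with `_get_class_default_kwargs(SomeClass)`. The old form built a full instance at import time just to read its attribute dict; the helper avoids that side effect. A minimal sketch of such a helper, assuming it only needs the defaults declared on `__init__` (the real implementation lives in ppdet's `cls_utils` and may differ; the names here are illustrative):

    import inspect

    def get_class_default_kwargs(cls):
        # Hypothetical stand-in for _get_class_default_kwargs: collect the
        # default values declared on cls.__init__ without instantiating cls.
        sig = inspect.signature(cls.__init__)
        return {
            name: param.default
            for name, param in sig.parameters.items()
            if param.default is not inspect.Parameter.empty
        }

    class RoIAlignLike:
        def __init__(self, resolution=14, spatial_scale=0.0625):
            self.resolution = resolution
            self.spatial_scale = spatial_scale

    print(get_class_default_kwargs(RoIAlignLike))
    # -> {'resolution': 14, 'spatial_scale': 0.0625}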
import paddle @@ -20,6 +20,7 @@ from paddle.nn.initializer import KaimingNormal from paddlers.models.ppdet.core.workspace import register, create from paddlers.models.ppdet.modeling.layers import ConvNormLayer from .roi_extractor import RoIAlign +from ..cls_utils import _get_class_default_kwargs @register @@ -103,7 +104,7 @@ class MaskFeat(nn.Layer): @register class MaskHead(nn.Layer): - __shared__ = ['num_classes'] + __shared__ = ['num_classes', 'export_onnx'] __inject__ = ['mask_assigner'] """ RCNN mask head @@ -111,7 +112,7 @@ class MaskHead(nn.Layer): Args: head (nn.Layer): Extract feature in mask head roi_extractor (object): The module of RoI Extractor - mask_assigner (object): The module of Mask Assigner, + mask_assigner (object): The module of Mask Assigner, label and sample the mask num_classes (int): The number of classes share_bbox_feat (bool): Whether to share the feature from bbox head, @@ -120,12 +121,14 @@ class MaskHead(nn.Layer): def __init__(self, head, - roi_extractor=RoIAlign().__dict__, + roi_extractor=_get_class_default_kwargs(RoIAlign), mask_assigner='MaskAssigner', num_classes=80, - share_bbox_feat=False): + share_bbox_feat=False, + export_onnx=False): super(MaskHead, self).__init__() self.num_classes = num_classes + self.export_onnx = export_onnx self.roi_extractor = roi_extractor if isinstance(roi_extractor, dict): @@ -206,8 +209,8 @@ class MaskHead(nn.Layer): rois_num (Tensor): The number of prediction for each batch scale_factor (Tensor): The scale factor from origin size to input size """ - if rois.shape[0] == 0: - mask_out = paddle.full([1, 1, 1, 1], -1) + if not self.export_onnx and rois.shape[0] == 0: + mask_out = paddle.full([1, 1, 1], -1) else: bbox = [rois[:, 2:]] labels = rois[:, 0].cast('int32') @@ -218,19 +221,17 @@ class MaskHead(nn.Layer): mask_feat = self.head(rois_feat) mask_logit = self.mask_fcn_logits(mask_feat) - mask_num_class = mask_logit.shape[1] - if mask_num_class == 1: - mask_out = F.sigmoid(mask_logit) + if self.num_classes == 1: + mask_out = F.sigmoid(mask_logit)[:, 0, :, :] else: - num_masks = mask_logit.shape[0] - mask_out = [] - # TODO: need to optimize gather - for i in range(mask_logit.shape[0]): - pred_masks = paddle.unsqueeze( - mask_logit[i, :, :, :], axis=0) - mask = paddle.gather(pred_masks, labels[i], axis=1) - mask_out.append(mask) - mask_out = F.sigmoid(paddle.concat(mask_out)) + num_masks = paddle.shape(mask_logit)[0] + index = paddle.arange(num_masks).cast('int32') + mask_out = mask_logit[index, labels] + mask_out_shape = paddle.shape(mask_out) + mask_out = paddle.reshape(mask_out, [ + paddle.shape(index), mask_out_shape[-2], mask_out_shape[-1] + ]) + mask_out = F.sigmoid(mask_out) return mask_out def forward(self, diff --git a/paddlers/models/ppdet/modeling/heads/pico_head.py b/paddlers/models/ppdet/modeling/heads/pico_head.py index a6915f4..9f25051 100644 --- a/paddlers/models/ppdet/modeling/heads/pico_head.py +++ b/paddlers/models/ppdet/modeling/heads/pico_head.py @@ -24,9 +24,36 @@ import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant +from paddlers.models.ppdet.modeling.ops import get_static_shape +from ..initializer import normal_ +from ..assigners.utils import generate_anchors_for_grid_cell +from ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance from paddlers.models.ppdet.core.workspace import register from paddlers.models.ppdet.modeling.layers import ConvNormLayer from .simota_head import OTAVFLHead +from .gfl_head import Integral, GFLHead 
+from paddlers.models.ppdet.modeling.necks.csp_pan import DPModule + +eps = 1e-9 + +__all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat'] + + +class PicoSE(nn.Layer): + def __init__(self, feat_channels): + super(PicoSE, self).__init__() + self.fc = nn.Conv2D(feat_channels, feat_channels, 1) + self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1) + + self._init_weights() + + def _init_weights(self): + normal_(self.fc.weight, std=0.001) + + def forward(self, feat, avg_feat): + weight = F.sigmoid(self.fc(avg_feat)) + out = self.conv(feat * weight) + return out @register @@ -39,6 +66,9 @@ class PicoFeat(nn.Layer): feat_out (int): The channel number of output Tensor. num_convs (int): The convolution number of the LiteGFLFeat. norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. + share_cls_reg (bool): Whether to share the cls and reg output. + act (str): The act of per layers. + use_se (bool): Whether to use se module. """ def __init__(self, @@ -48,14 +78,20 @@ class PicoFeat(nn.Layer): num_convs=2, norm_type='bn', share_cls_reg=False, - act='hard_swish'): + act='hard_swish', + use_se=False): super(PicoFeat, self).__init__() self.num_convs = num_convs self.norm_type = norm_type self.share_cls_reg = share_cls_reg self.act = act + self.use_se = use_se self.cls_convs = [] self.reg_convs = [] + if use_se: + assert share_cls_reg == True, \ + 'In the case of using se, share_cls_reg must be set to True' + self.se = nn.LayerList() for stage_idx in range(num_fpn_stride): cls_subnet_convs = [] reg_subnet_convs = [] @@ -111,12 +147,16 @@ class PicoFeat(nn.Layer): reg_subnet_convs.append(reg_conv_pw) self.cls_convs.append(cls_subnet_convs) self.reg_convs.append(reg_subnet_convs) + if use_se: + self.se.append(PicoSE(feat_out)) def act_func(self, x): if self.act == "leaky_relu": x = F.leaky_relu(x) elif self.act == "hard_swish": x = F.hardswish(x) + elif self.act == "relu6": + x = F.relu6(x) return x def forward(self, fpn_feat, stage_idx): @@ -125,8 +165,13 @@ class PicoFeat(nn.Layer): reg_feat = fpn_feat for i in range(len(self.cls_convs[stage_idx])): cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat)) + reg_feat = cls_feat if not self.share_cls_reg: reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat)) + if self.use_se: + avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1)) + se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat)) + return cls_feat, se_feat return cls_feat, reg_feat @@ -150,7 +195,7 @@ class PicoHead(OTAVFLHead): 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'assigner', 'nms' ] - __shared__ = ['num_classes'] + __shared__ = ['num_classes', 'eval_size'] def __init__(self, conv_feat='PicoFeat', @@ -166,7 +211,8 @@ class PicoHead(OTAVFLHead): feat_in_chan=96, nms=None, nms_pre=1000, - cell_offset=0): + cell_offset=0, + eval_size=None): super(PicoHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, @@ -195,6 +241,7 @@ class PicoHead(OTAVFLHead): self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset + self.eval_size = eval_size self.use_sigmoid = self.loss_vfl.use_sigmoid if self.use_sigmoid: @@ -238,12 +285,50 @@ class PicoHead(OTAVFLHead): bias_attr=ParamAttr(initializer=Constant(value=0)))) self.head_reg_list.append(head_reg) - def forward(self, fpn_feats, deploy=False): + # initialize the anchor points + if self.eval_size: + self.anchor_points, self.stride_tensor = self._generate_anchors() + + def forward(self, fpn_feats, export_post_process=True): assert len(fpn_feats) == len( self.fpn_stride ), "The 
size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] + + if self.training: + return self.forward_train(fpn_feats) + else: + return self.forward_eval( + fpn_feats, export_post_process=export_post_process) + + def forward_train(self, fpn_feats): + cls_logits_list, bboxes_reg_list = [], [] + for i, fpn_feat in enumerate(fpn_feats): + conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) + if self.conv_feat.share_cls_reg: + cls_logits = self.head_cls_list[i](conv_cls_feat) + cls_score, bbox_pred = paddle.split( + cls_logits, + [self.cls_out_channels, 4 * (self.reg_max + 1)], + axis=1) + else: + cls_score = self.head_cls_list[i](conv_cls_feat) + bbox_pred = self.head_reg_list[i](conv_reg_feat) + + if self.dgqp_module: + quality_score = self.dgqp_module(bbox_pred) + cls_score = F.sigmoid(cls_score) * quality_score + + cls_logits_list.append(cls_score) + bboxes_reg_list.append(bbox_pred) + + return (cls_logits_list, bboxes_reg_list) + + def forward_eval(self, fpn_feats, export_post_process=True): + if self.eval_size: + anchor_points, stride_tensor = self.anchor_points, self.stride_tensor + else: + anchor_points, stride_tensor = self._generate_anchors(fpn_feats) + cls_logits_list, bboxes_reg_list = [], [] for i, fpn_feat in enumerate(fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) if self.conv_feat.share_cls_reg: @@ -260,18 +345,439 @@ class PicoHead(OTAVFLHead): quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score - if deploy: + if not export_post_process: # Now only supports batch size = 1 in deploy # TODO(ygh): support batch size > 1 - cls_score = F.sigmoid(cls_score).reshape( + cls_score_out = F.sigmoid(cls_score).reshape( [1, self.cls_out_channels, -1]).transpose([0, 2, 1]) bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose([0, 2, 1]) - elif not self.training: - cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) + else: + _, _, h, w = fpn_feat.shape + l = h * w + cls_score_out = F.sigmoid( + cls_score.reshape([-1, self.cls_out_channels, l])) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) + bbox_pred = self.distribution_project(bbox_pred) + bbox_pred = bbox_pred.reshape([-1, l, 4]) - cls_logits_list.append(cls_score) + cls_logits_list.append(cls_score_out) bboxes_reg_list.append(bbox_pred) + if export_post_process: + cls_logits_list = paddle.concat(cls_logits_list, axis=-1) + bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1) + bboxes_reg_list = batch_distance2bbox(anchor_points, + bboxes_reg_list) + bboxes_reg_list *= stride_tensor + return (cls_logits_list, bboxes_reg_list) + + def _generate_anchors(self, feats=None): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_stride): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = math.ceil(self.eval_size[0] / stride) + w = math.ceil(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.cell_offset + shift_y = paddle.arange(end=h) + self.cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype='float32') + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append( + paddle.full( + [h * w, 1], stride, dtype='float32')) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def post_process(self, head_outs, scale_factor, 
export_nms=True): + pred_scores, pred_bboxes = head_outs + if not export_nms: + return pred_bboxes, pred_scores + else: + # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] + scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) + scale_factor = paddle.concat( + [scale_x, scale_y, scale_x, scale_y], + axis=-1).reshape([-1, 1, 4]) + # scale bbox to origin image size. + pred_bboxes /= scale_factor + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num + + +@register +class PicoHeadV2(GFLHead): + """ + PicoHeadV2 + Args: + conv_feat (object): Instance of 'PicoFeat' + num_classes (int): Number of classes + fpn_stride (list): The stride of each FPN Layer + prior_prob (float): Used to set the bias init for the class prediction layer + loss_class (object): Instance of VariFocalLoss. + loss_dfl (object): Instance of DistributionFocalLoss. + loss_bbox (object): Instance of bbox loss. + assigner (object): Instance of label assigner. + reg_max: Max value of integral set :math: `{0, ..., reg_max}` + n QFL setting. Default: 7. + """ + __inject__ = [ + 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', + 'static_assigner', 'assigner', 'nms' + ] + __shared__ = ['num_classes', 'eval_size'] + + def __init__(self, + conv_feat='PicoFeatV2', + dgqp_module=None, + num_classes=80, + fpn_stride=[8, 16, 32], + prior_prob=0.01, + use_align_head=True, + loss_class='VariFocalLoss', + loss_dfl='DistributionFocalLoss', + loss_bbox='GIoULoss', + static_assigner_epoch=60, + static_assigner='ATSSAssigner', + assigner='TaskAlignedAssigner', + reg_max=16, + feat_in_chan=96, + nms=None, + nms_pre=1000, + cell_offset=0, + act='hard_swish', + grid_cell_scale=5.0, + eval_size=None): + super(PicoHeadV2, self).__init__( + conv_feat=conv_feat, + dgqp_module=dgqp_module, + num_classes=num_classes, + fpn_stride=fpn_stride, + prior_prob=prior_prob, + loss_class=loss_class, + loss_dfl=loss_dfl, + loss_bbox=loss_bbox, + reg_max=reg_max, + feat_in_chan=feat_in_chan, + nms=nms, + nms_pre=nms_pre, + cell_offset=cell_offset, ) + self.conv_feat = conv_feat + self.num_classes = num_classes + self.fpn_stride = fpn_stride + self.prior_prob = prior_prob + self.loss_vfl = loss_class + self.loss_dfl = loss_dfl + self.loss_bbox = loss_bbox + + self.static_assigner_epoch = static_assigner_epoch + self.static_assigner = static_assigner + self.assigner = assigner + + self.reg_max = reg_max + self.feat_in_chan = feat_in_chan + self.nms = nms + self.nms_pre = nms_pre + self.cell_offset = cell_offset + self.act = act + self.grid_cell_scale = grid_cell_scale + self.use_align_head = use_align_head + self.cls_out_channels = self.num_classes + self.eval_size = eval_size + + bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) + # Clear the super class initialization + self.gfl_head_cls = None + self.gfl_head_reg = None + self.scales_regs = None + + self.head_cls_list = [] + self.head_reg_list = [] + self.cls_align = nn.LayerList() + + for i in range(len(fpn_stride)): + head_cls = self.add_sublayer( + "head_cls" + str(i), + nn.Conv2D( + in_channels=self.feat_in_chan, + out_channels=self.cls_out_channels, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(initializer=Normal( + mean=0., std=0.01)), + bias_attr=ParamAttr( + initializer=Constant(value=bias_init_value)))) + self.head_cls_list.append(head_cls) + head_reg = self.add_sublayer( + "head_reg" + str(i), + nn.Conv2D( + in_channels=self.feat_in_chan, + out_channels=4 * (self.reg_max + 1), + 
kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(initializer=Normal( + mean=0., std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=0)))) + self.head_reg_list.append(head_reg) + if self.use_align_head: + self.cls_align.append( + DPModule( + self.feat_in_chan, + 1, + 5, + act=self.act, + use_act_in_out=False)) + + # initialize the anchor points + if self.eval_size: + self.anchor_points, self.stride_tensor = self._generate_anchors() + + def forward(self, fpn_feats, export_post_process=True): + assert len(fpn_feats) == len( + self.fpn_stride + ), "The size of fpn_feats is not equal to size of fpn_stride" + + if self.training: + return self.forward_train(fpn_feats) + else: + return self.forward_eval( + fpn_feats, export_post_process=export_post_process) + + def forward_train(self, fpn_feats): + cls_score_list, reg_list, box_list = [], [], [] + for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): + b, _, h, w = get_static_shape(fpn_feat) + # task decomposition + conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) + cls_logit = self.head_cls_list[i](se_feat) + reg_pred = self.head_reg_list[i](se_feat) + + # cls prediction and alignment + if self.use_align_head: + cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) + cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() + else: + cls_score = F.sigmoid(cls_logit) + + cls_score_out = cls_score.transpose([0, 2, 3, 1]) + bbox_pred = reg_pred.transpose([0, 2, 3, 1]) + b, cell_h, cell_w, _ = paddle.shape(cls_score_out) + y, x = self.get_single_level_center_point( + [cell_h, cell_w], stride, cell_offset=self.cell_offset) + center_points = paddle.stack([x, y], axis=-1) + cls_score_out = cls_score_out.reshape( + [b, -1, self.cls_out_channels]) + bbox_pred = self.distribution_project(bbox_pred) * stride + bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) + bbox_pred = batch_distance2bbox( + center_points, bbox_pred, max_shapes=None) + cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) + reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1])) + box_list.append(bbox_pred / stride) + + cls_score_list = paddle.concat(cls_score_list, axis=1) + box_list = paddle.concat(box_list, axis=1) + reg_list = paddle.concat(reg_list, axis=1) + return cls_score_list, reg_list, box_list, fpn_feats + + def forward_eval(self, fpn_feats, export_post_process=True): + if self.eval_size: + anchor_points, stride_tensor = self.anchor_points, self.stride_tensor + else: + anchor_points, stride_tensor = self._generate_anchors(fpn_feats) + cls_score_list, box_list = [], [] + for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): + _, _, h, w = fpn_feat.shape + # task decomposition + conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) + cls_logit = self.head_cls_list[i](se_feat) + reg_pred = self.head_reg_list[i](se_feat) + + # cls prediction and alignment + if self.use_align_head: + cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) + cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() + else: + cls_score = F.sigmoid(cls_logit) + + if not export_post_process: + # Now only supports batch size = 1 in deploy + cls_score_list.append( + cls_score.reshape([1, self.cls_out_channels, -1]).transpose( + [0, 2, 1])) + box_list.append( + reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose( + [0, 2, 1])) + else: + l = h * w + cls_score_out = cls_score.reshape( + [-1, self.cls_out_channels, l]) + bbox_pred = reg_pred.transpose([0, 2, 3, 1]) + bbox_pred = self.distribution_project(bbox_pred) 
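+                # (Editorial sketch, not part of the patch.)
+                # distribution_project is the Integral layer from
+                # gfl_head.py: per box side it softmaxes the reg_max + 1
+                # bin logits and takes the expectation
+                #   d = sum_i i * softmax(logits)_i,  i in {0, ..., reg_max}
+                # so bbox_pred now holds one scalar distance per side, in
+                # grid units; boxes are scaled by stride_tensor below.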
+ bbox_pred = bbox_pred.reshape([-1, l, 4]) + cls_score_list.append(cls_score_out) + box_list.append(bbox_pred) + + if export_post_process: + cls_score_list = paddle.concat(cls_score_list, axis=-1) + box_list = paddle.concat(box_list, axis=1) + box_list = batch_distance2bbox(anchor_points, box_list) + box_list *= stride_tensor + + return cls_score_list, box_list + + def get_loss(self, head_outs, gt_meta): + pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] + gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None + num_imgs = gt_meta['im_id'].shape[0] + pad_gt_mask = gt_meta['pad_gt_mask'] + + anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell( + fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset) + + centers = bbox_center(anchors) + + # label assignment + if gt_meta['epoch_id'] < self.static_assigner_epoch: + assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( + anchors, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + gt_scores=gt_scores, + pred_bboxes=pred_bboxes.detach() * stride_tensor_list) + + else: + assigned_labels, assigned_bboxes, assigned_scores = self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor_list, + centers, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + gt_scores=gt_scores) + + assigned_bboxes /= stride_tensor_list + + centers_shape = centers.shape + flatten_centers = centers.expand( + [num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2]) + flatten_strides = stride_tensor_list.expand( + [num_imgs, centers_shape[0], 1]).reshape([-1, 1]) + flatten_cls_preds = pred_scores.reshape([-1, self.num_classes]) + flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)]) + flatten_bboxes = pred_bboxes.reshape([-1, 4]) + flatten_bbox_targets = assigned_bboxes.reshape([-1, 4]) + flatten_labels = assigned_labels.reshape([-1]) + flatten_assigned_scores = assigned_scores.reshape( + [-1, self.num_classes]) + + pos_inds = paddle.nonzero( + paddle.logical_and((flatten_labels >= 0), + (flatten_labels < self.num_classes)), + as_tuple=False).squeeze(1) + + num_total_pos = len(pos_inds) + + if num_total_pos > 0: + pos_bbox_targets = paddle.gather( + flatten_bbox_targets, pos_inds, axis=0) + pos_decode_bbox_pred = paddle.gather( + flatten_bboxes, pos_inds, axis=0) + pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0) + pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0) + pos_centers = paddle.gather( + flatten_centers, pos_inds, axis=0) / pos_strides + + weight_targets = flatten_assigned_scores.detach() + weight_targets = paddle.gather( + weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) + + pred_corners = pos_reg.reshape([-1, self.reg_max + 1]) + target_corners = bbox2distance(pos_centers, pos_bbox_targets, + self.reg_max).reshape([-1]) + # regression loss + loss_bbox = paddle.sum( + self.loss_bbox(pos_decode_bbox_pred, + pos_bbox_targets) * weight_targets) + + # dfl loss + loss_dfl = self.loss_dfl( + pred_corners, + target_corners, + weight=weight_targets.expand([-1, 4]).reshape([-1]), + avg_factor=4.0) + else: + loss_bbox = paddle.zeros([1]) + loss_dfl = paddle.zeros([1]) + + avg_factor = flatten_assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(avg_factor) + avg_factor = paddle.clip( + avg_factor / paddle.distributed.get_world_size(), 
min=1) + loss_vfl = self.loss_vfl( + flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor) + + loss_bbox = loss_bbox / avg_factor + loss_dfl = loss_dfl / avg_factor + + loss_states = dict( + loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) + + return loss_states + + def _generate_anchors(self, feats=None): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_stride): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = math.ceil(self.eval_size[0] / stride) + w = math.ceil(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.cell_offset + shift_y = paddle.arange(end=h) + self.cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype='float32') + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append( + paddle.full( + [h * w, 1], stride, dtype='float32')) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def post_process(self, head_outs, scale_factor, export_nms=True): + pred_scores, pred_bboxes = head_outs + if not export_nms: + return pred_bboxes, pred_scores + else: + # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] + scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) + scale_factor = paddle.concat( + [scale_x, scale_y, scale_x, scale_y], + axis=-1).reshape([-1, 1, 4]) + # scale bbox to origin image size. + pred_bboxes /= scale_factor + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/paddlers/models/ppdet/modeling/heads/ppyoloe_head.py b/paddlers/models/ppdet/modeling/heads/ppyoloe_head.py new file mode 100644 index 0000000..48a2af7 --- /dev/null +++ b/paddlers/models/ppdet/modeling/heads/ppyoloe_head.py @@ -0,0 +1,388 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
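The new ppyoloe_head.py below regresses each box side as a discrete distribution over {0, ..., reg_max} and decodes it by expectation: proj_conv is a frozen 1x1 conv whose weights are set to linspace(0, reg_max, reg_max + 1), so applying it to the softmaxed bins is a dot product with the bin indices. A short editorial sketch of that decoding (shapes and names here are illustrative, not from the patch):

    import paddle
    import paddle.nn.functional as F

    # DFL-style decoding as performed by proj_conv: the regressed distance
    # is the expected value of a distribution over reg_max + 1 bins.
    reg_max = 16
    logits = paddle.rand([8, reg_max + 1])              # 8 box sides, 17 bins each
    probs = F.softmax(logits, axis=-1)
    project = paddle.linspace(0, reg_max, reg_max + 1)  # [0., 1., ..., 16.]
    expected = (probs * project).sum(axis=-1)           # one distance per side, in stride units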
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlers.models.ppdet.core.workspace import register + +from ..bbox_utils import batch_distance2bbox +from ..losses import GIoULoss +from ..initializer import bias_init_with_prob, constant_, normal_ +from ..assigners.utils import generate_anchors_for_grid_cell +from paddlers.models.ppdet.modeling.backbones.cspresnet import ConvBNLayer +from paddlers.models.ppdet.modeling.ops import get_static_shape, get_act_fn +from paddlers.models.ppdet.modeling.layers import MultiClassNMS + +__all__ = ['PPYOLOEHead'] + + +class ESEAttn(nn.Layer): + def __init__(self, feat_channels, act='swish'): + super(ESEAttn, self).__init__() + self.fc = nn.Conv2D(feat_channels, feat_channels, 1) + self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) + + self._init_weights() + + def _init_weights(self): + normal_(self.fc.weight, std=0.001) + + def forward(self, feat, avg_feat): + weight = F.sigmoid(self.fc(avg_feat)) + return self.conv(feat * weight) + + +@register +class PPYOLOEHead(nn.Layer): + __shared__ = [ + 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process' + ] + __inject__ = ['static_assigner', 'assigner', 'nms'] + + def __init__(self, + in_channels=[1024, 512, 256], + num_classes=80, + act='swish', + fpn_strides=(32, 16, 8), + grid_cell_scale=5.0, + grid_cell_offset=0.5, + reg_max=16, + static_assigner_epoch=4, + use_varifocal_loss=True, + static_assigner='ATSSAssigner', + assigner='TaskAlignedAssigner', + nms='MultiClassNMS', + eval_size=None, + loss_weight={ + 'class': 1.0, + 'iou': 2.5, + 'dfl': 0.5, + }, + trt=False, + exclude_nms=False, + exclude_post_process=False): + super(PPYOLOEHead, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + self.reg_max = reg_max + self.iou_loss = GIoULoss() + self.loss_weight = loss_weight + self.use_varifocal_loss = use_varifocal_loss + self.eval_size = eval_size + + self.static_assigner_epoch = static_assigner_epoch + self.static_assigner = static_assigner + self.assigner = assigner + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + # stem + self.stem_cls = nn.LayerList() + self.stem_reg = nn.LayerList() + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, dict)) else act + for in_c in self.in_channels: + self.stem_cls.append(ESEAttn(in_c, act=act)) + self.stem_reg.append(ESEAttn(in_c, act=act)) + # pred head + self.pred_cls = nn.LayerList() + self.pred_reg = nn.LayerList() + for in_c in self.in_channels: + self.pred_cls.append( + nn.Conv2D( + in_c, self.num_classes, 3, padding=1)) + self.pred_reg.append( + nn.Conv2D( + in_c, 4 * (self.reg_max + 1), 3, padding=1)) + # projection conv + self.proj_conv = nn.Conv2D(self.reg_max + 1, 1, 1, bias_attr=False) + self.proj_conv.skip_quant = True + self._init_weights() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _init_weights(self): + bias_cls = bias_init_with_prob(0.01) + for cls_, reg_ in zip(self.pred_cls, self.pred_reg): + constant_(cls_.weight) + constant_(cls_.bias, bias_cls) + constant_(reg_.weight) + constant_(reg_.bias, 1.0) + + proj = paddle.linspace(0, self.reg_max, 
self.reg_max + 1).reshape( + [1, self.reg_max + 1, 1, 1]) + self.proj_conv.weight.set_value(proj) + self.proj_conv.weight.stop_gradient = True + if self.eval_size: + anchor_points, stride_tensor = self._generate_anchors() + self.anchor_points = anchor_points + self.stride_tensor = stride_tensor + + def forward_train(self, feats, targets): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_score_list, reg_distri_list = [], [] + for i, feat in enumerate(feats): + avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + + feat) + reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) + # cls and reg + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) + reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_distri_list = paddle.concat(reg_distri_list, axis=1) + + return self.get_loss([ + cls_score_list, reg_distri_list, anchors, anchor_points, + num_anchors_list, stride_tensor + ], targets) + + def _generate_anchors(self, feats=None, dtype='float32'): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_strides): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = int(self.eval_size[0] / stride) + w = int(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.grid_cell_offset + shift_y = paddle.arange(end=h) + self.grid_cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype=dtype) + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def forward_eval(self, feats): + if self.eval_size: + anchor_points, stride_tensor = self.anchor_points, self.stride_tensor + else: + anchor_points, stride_tensor = self._generate_anchors(feats) + cls_score_list, reg_dist_list = [], [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + + feat) + reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) + reg_dist = reg_dist.reshape([-1, 4, self.reg_max + 1, l]).transpose( + [0, 2, 3, 1]) + reg_dist = self.proj_conv(F.softmax(reg_dist, axis=1)).squeeze(1) + # cls and reg + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) + reg_dist_list.append(reg_dist) + + cls_score_list = paddle.concat(cls_score_list, axis=-1) + reg_dist_list = paddle.concat(reg_dist_list, axis=1) + + return cls_score_list, reg_dist_list, anchor_points, stride_tensor + + def forward(self, feats, targets=None): + assert len(feats) == len(self.fpn_strides), \ + "The size of feats is not equal to size of fpn_strides" + + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + @staticmethod + def _focal_loss(score, label, alpha=0.25, gamma=2.0): + weight = (score - label).pow(gamma) + if alpha > 0: + alpha_t = alpha * label + (1 - alpha) * (1 - label) + weight *= alpha_t + loss = F.binary_cross_entropy( + score, label, 
weight=weight, reduction='sum') + return loss + + @staticmethod + def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + loss = F.binary_cross_entropy( + pred_score, gt_score, weight=weight, reduction='sum') + return loss + + def _bbox_decode(self, anchor_points, pred_dist): + _, l, _ = get_static_shape(pred_dist) + pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_max + 1])) + pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) + return batch_distance2bbox(anchor_points, pred_dist) + + def _bbox2distance(self, points, bbox): + x1y1, x2y2 = paddle.split(bbox, 2, -1) + lt = points - x1y1 + rb = x2y2 - points + return paddle.concat([lt, rb], -1).clip(0, self.reg_max - 0.01) + + def _df_loss(self, pred_dist, target): + target_left = paddle.cast(target, 'int64') + target_right = target_left + 1 + weight_left = target_right.astype('float32') - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist, target_left, reduction='none') * weight_left + loss_right = F.cross_entropy( + pred_dist, target_right, reduction='none') * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + + def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels, + assigned_bboxes, assigned_scores, assigned_scores_sum): + # select positive samples mask + mask_positive = (assigned_labels != self.num_classes) + num_pos = mask_positive.sum() + # pos/neg loss + if num_pos > 0: + # l1 + iou + bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) + pred_bboxes_pos = paddle.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = paddle.masked_select( + assigned_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = paddle.masked_select( + assigned_scores.sum(-1), mask_positive).unsqueeze(-1) + + loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) + + loss_iou = self.iou_loss(pred_bboxes_pos, + assigned_bboxes_pos) * bbox_weight + loss_iou = loss_iou.sum() / assigned_scores_sum + + dist_mask = mask_positive.unsqueeze(-1).tile( + [1, 1, (self.reg_max + 1) * 4]) + pred_dist_pos = paddle.masked_select( + pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1]) + assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes) + assigned_ltrb_pos = paddle.masked_select( + assigned_ltrb, bbox_mask).reshape([-1, 4]) + loss_dfl = self._df_loss(pred_dist_pos, + assigned_ltrb_pos) * bbox_weight + loss_dfl = loss_dfl.sum() / assigned_scores_sum + else: + loss_l1 = paddle.zeros([1]) + loss_iou = paddle.zeros([1]) + loss_dfl = pred_dist.sum() * 0. 
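+            # (Editorial note, not part of the patch.) `pred_dist.sum() * 0.`
+            # rather than paddle.zeros([1]): the product stays connected to
+            # the network outputs, so every parameter still receives a
+            # (zero) gradient when a batch has no positive samples. A
+            # detached zeros tensor would leave those parameters without
+            # gradients, which can stall gradient all-reduce across ranks
+            # in distributed training.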
+ return loss_l1, loss_iou, loss_dfl + + def get_loss(self, head_outs, gt_meta): + pred_scores, pred_distri, anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] + pad_gt_mask = gt_meta['pad_gt_mask'] + # label assignment + if gt_meta['epoch_id'] < self.static_assigner_epoch: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.static_assigner( + anchors, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + pred_bboxes=pred_bboxes.detach() * stride_tensor) + alpha_l = 0.25 + else: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + alpha_l = -1 + # rescale bbox + assigned_bboxes /= stride_tensor + # cls loss + if self.use_varifocal_loss: + one_hot_label = F.one_hot(assigned_labels, + self.num_classes + 1)[..., :-1] + loss_cls = self._varifocal_loss(pred_scores, assigned_scores, + one_hot_label) + else: + loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) + + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum /= paddle.distributed.get_world_size() + assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) + loss_cls /= assigned_scores_sum + + loss_l1, loss_iou, loss_dfl = \ + self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, + assigned_scores_sum) + loss = self.loss_weight['class'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + out_dict = { + 'loss': loss, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + 'loss_dfl': loss_dfl, + 'loss_l1': loss_l1, + } + return out_dict + + def post_process(self, head_outs, scale_factor): + pred_scores, pred_dist, anchor_points, stride_tensor = head_outs + pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) + pred_bboxes *= stride_tensor + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None + else: + # scale bbox to origin + scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) + scale_factor = paddle.concat( + [scale_x, scale_y, scale_x, scale_y], + axis=-1).reshape([-1, 1, 4]) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/paddlers/models/ppdet/modeling/heads/retina_head.py b/paddlers/models/ppdet/modeling/heads/retina_head.py new file mode 100644 index 0000000..b9939f4 --- /dev/null +++ b/paddlers/models/ppdet/modeling/heads/retina_head.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Normal, Constant +from paddlers.models.ppdet.modeling.bbox_utils import bbox2delta, delta2bbox +from paddlers.models.ppdet.modeling.heads.fcos_head import FCOSFeat + +from paddlers.models.ppdet.core.workspace import register + +__all__ = ['RetinaHead'] + + +@register +class RetinaFeat(FCOSFeat): + """We use FCOSFeat to construct conv layers in RetinaNet. + We rename FCOSFeat to RetinaFeat to avoid confusion. + """ + pass + + +@register +class RetinaHead(nn.Layer): + """Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf + """ + __shared__ = ['num_classes'] + __inject__ = [ + 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', + 'loss_bbox', 'nms' + ] + + def __init__(self, + num_classes=80, + conv_feat='RetinaFeat', + anchor_generator='RetinaAnchorGenerator', + bbox_assigner='MaxIoUAssigner', + loss_class='FocalLoss', + loss_bbox='SmoothL1Loss', + nms='MultiClassNMS', + prior_prob=0.01, + nms_pre=1000, + weights=[1., 1., 1., 1.]): + super(RetinaHead, self).__init__() + self.num_classes = num_classes + self.conv_feat = conv_feat + self.anchor_generator = anchor_generator + self.bbox_assigner = bbox_assigner + self.loss_class = loss_class + self.loss_bbox = loss_bbox + self.nms = nms + self.nms_pre = nms_pre + self.weights = weights + + bias_init_value = -math.log((1 - prior_prob) / prior_prob) + num_anchors = self.anchor_generator.num_anchors + self.retina_cls = nn.Conv2D( + in_channels=self.conv_feat.feat_out, + out_channels=self.num_classes * num_anchors, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=bias_init_value))) + self.retina_reg = nn.Conv2D( + in_channels=self.conv_feat.feat_out, + out_channels=4 * num_anchors, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=0))) + + def forward(self, neck_feats, targets=None): + cls_logits_list = [] + bboxes_reg_list = [] + for neck_feat in neck_feats: + conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat) + cls_logits = self.retina_cls(conv_cls_feat) + bbox_reg = self.retina_reg(conv_reg_feat) + cls_logits_list.append(cls_logits) + bboxes_reg_list.append(bbox_reg) + + if self.training: + return self.get_loss([cls_logits_list, bboxes_reg_list], targets) + else: + return [cls_logits_list, bboxes_reg_list] + + def get_loss(self, head_outputs, targets): + """Here we calculate loss for a batch of images. + We assign anchors to gts in each image and gather all the assigned + postive and negative samples. Then loss is calculated on the gathered + samples. 
+ """ + cls_logits_list, bboxes_reg_list = head_outputs + anchors = self.anchor_generator(cls_logits_list) + anchors = paddle.concat(anchors) + + # matches: contain gt_inds + # match_labels: -1(ignore), 0(neg) or 1(pos) + matches_list, match_labels_list = [], [] + # assign anchors to gts, no sampling is involved + for gt_bbox in targets['gt_bbox']: + matches, match_labels = self.bbox_assigner(anchors, gt_bbox) + matches_list.append(matches) + match_labels_list.append(match_labels) + + # reshape network outputs + cls_logits = [ + _.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes]) + for _ in cls_logits_list + ] + bboxes_reg = [ + _.transpose([0, 2, 3, 1]).reshape([0, -1, 4]) + for _ in bboxes_reg_list + ] + cls_logits = paddle.concat(cls_logits, axis=1) + bboxes_reg = paddle.concat(bboxes_reg, axis=1) + + cls_pred_list, cls_tar_list = [], [] + reg_pred_list, reg_tar_list = [], [] + # find and gather preds and targets in each image + for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \ + zip(matches_list, match_labels_list, cls_logits, bboxes_reg, + targets['gt_bbox'], targets['gt_class']): + pos_mask = (match_labels == 1) + neg_mask = (match_labels == 0) + chosen_mask = paddle.logical_or(pos_mask, neg_mask) + + gt_class = gt_class.reshape([-1]) + bg_class = paddle.to_tensor( + [self.num_classes], dtype=gt_class.dtype) + # a trick to assign num_classes to negative targets + gt_class = paddle.concat([gt_class, bg_class], axis=-1) + matches = paddle.where(neg_mask, + paddle.full_like(matches, gt_class.size - 1), + matches) + + cls_pred = cls_logit[chosen_mask] + cls_tar = gt_class[matches[chosen_mask]] + reg_pred = bbox_reg[pos_mask].reshape([-1, 4]) + reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4]) + reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights) + cls_pred_list.append(cls_pred) + cls_tar_list.append(cls_tar) + reg_pred_list.append(reg_pred) + reg_tar_list.append(reg_tar) + cls_pred = paddle.concat(cls_pred_list) + cls_tar = paddle.concat(cls_tar_list) + reg_pred = paddle.concat(reg_pred_list) + reg_tar = paddle.concat(reg_tar_list) + + avg_factor = max(1.0, reg_pred.shape[0]) + cls_loss = self.loss_class( + cls_pred, cls_tar, reduction='sum') / avg_factor + + if reg_pred.shape[0] == 0: + reg_loss = paddle.zeros([1]) + reg_loss.stop_gradient = False + else: + reg_loss = self.loss_bbox( + reg_pred, reg_tar, reduction='sum') / avg_factor + + loss = cls_loss + reg_loss + out_dict = { + 'loss_cls': cls_loss, + 'loss_reg': reg_loss, + 'loss': loss, + } + return out_dict + + def get_bboxes_single(self, + anchors, + cls_scores_list, + bbox_preds_list, + im_shape, + scale_factor, + rescale=True): + assert len(cls_scores_list) == len(bbox_preds_list) + mlvl_bboxes = [] + mlvl_scores = [] + for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list, + bbox_preds_list): + cls_score = cls_score.reshape([-1, self.num_classes]) + bbox_pred = bbox_pred.reshape([-1, 4]) + if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: + max_score = cls_score.max(axis=1) + _, topk_inds = max_score.topk(self.nms_pre) + bbox_pred = bbox_pred.gather(topk_inds) + anchor = anchor.gather(topk_inds) + cls_score = cls_score.gather(topk_inds) + bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze() + mlvl_bboxes.append(bbox_pred) + mlvl_scores.append(F.sigmoid(cls_score)) + mlvl_bboxes = paddle.concat(mlvl_bboxes) + mlvl_bboxes = paddle.squeeze(mlvl_bboxes) + if rescale: + mlvl_bboxes = mlvl_bboxes / paddle.concat( + [scale_factor[::-1], 
scale_factor[::-1]]) + mlvl_scores = paddle.concat(mlvl_scores) + mlvl_scores = mlvl_scores.transpose([1, 0]) + return mlvl_bboxes, mlvl_scores + + def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor): + batch_bboxes = [] + batch_scores = [] + for img_id in range(cls_logits[0].shape[0]): + num_lvls = len(cls_logits) + cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] + bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)] + bboxes, scores = self.get_bboxes_single( + anchors, cls_scores_list, bbox_preds_list, im_shape[img_id], + scale_factor[img_id]) + batch_bboxes.append(bboxes) + batch_scores.append(scores) + batch_bboxes = paddle.stack(batch_bboxes, axis=0) + batch_scores = paddle.stack(batch_scores, axis=0) + return batch_bboxes, batch_scores + + def post_process(self, head_outputs, im_shape, scale_factor): + cls_logits_list, bboxes_reg_list = head_outputs + anchors = self.anchor_generator(cls_logits_list) + cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] + bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list] + bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape, + scale_factor) + + bbox_pred, bbox_num, _ = self.nms(bboxes, scores) + return bbox_pred, bbox_num diff --git a/paddlers/models/ppdet/modeling/heads/roi_extractor.py b/paddlers/models/ppdet/modeling/heads/roi_extractor.py index 78646e6..7c6991b 100644 --- a/paddlers/models/ppdet/modeling/heads/roi_extractor.py +++ b/paddlers/models/ppdet/modeling/heads/roi_extractor.py @@ -29,7 +29,7 @@ class RoIAlign(object): RoI Align module For more details, please refer to the document of roi_align in - in ppdet/modeing/ops.py + in https://github.com/PaddlePaddle/Paddle/blob/release/2.5/python/paddle/vision/ops.py Args: resolution (int): The output size, default 14 @@ -38,9 +38,9 @@ class RoIAlign(object): default 0.0625 sampling_ratio (int): The number of sampling points in the interpolation grid, default 0 - canconical_level (int): The referring level of FPN layer with + canconical_level (int): The referring level of FPN layer with specified level. default 4 - canonical_size (int): The referring scale of FPN layer with + canonical_size (int): The referring scale of FPN layer with specified scale. 
default 224 start_level (int): The start level of FPN layer to extract RoI feature, default 0 @@ -76,33 +76,43 @@ class RoIAlign(object): def __call__(self, feats, roi, rois_num): roi = paddle.concat(roi) if len(roi) > 1 else roi[0] if len(feats) == 1: - rois_feat = ops.roi_align( - feats[self.start_level], - roi, - self.resolution, - self.spatial_scale[0], - rois_num=rois_num, + rois_feat = paddle.vision.ops.roi_align( + x=feats[self.start_level], + boxes=roi, + boxes_num=rois_num, + output_size=self.resolution, + spatial_scale=self.spatial_scale[0], aligned=self.aligned) else: offset = 2 k_min = self.start_level + offset k_max = self.end_level + offset - rois_dist, restore_index, rois_num_dist = ops.distribute_fpn_proposals( - roi, - k_min, - k_max, - self.canconical_level, - self.canonical_size, - rois_num=rois_num) + if hasattr(paddle.vision.ops, "distribute_fpn_proposals"): + rois_dist, restore_index, rois_num_dist = paddle.vision.ops.distribute_fpn_proposals( + roi, + k_min, + k_max, + self.canconical_level, + self.canonical_size, + rois_num=rois_num) + else: + rois_dist, restore_index, rois_num_dist = ops.distribute_fpn_proposals( + roi, + k_min, + k_max, + self.canconical_level, + self.canonical_size, + rois_num=rois_num) + rois_feat_list = [] for lvl in range(self.start_level, self.end_level + 1): - roi_feat = ops.roi_align( - feats[lvl], - rois_dist[lvl], - self.resolution, - self.spatial_scale[lvl], + roi_feat = paddle.vision.ops.roi_align( + x=feats[lvl], + boxes=rois_dist[lvl], + boxes_num=rois_num_dist[lvl], + output_size=self.resolution, + spatial_scale=self.spatial_scale[lvl], sampling_ratio=self.sampling_ratio, - rois_num=rois_num_dist[lvl], aligned=self.aligned) rois_feat_list.append(roi_feat) rois_feat_shuffle = paddle.concat(rois_feat_list) diff --git a/paddlers/models/ppdet/modeling/heads/s2anet_head.py b/paddlers/models/ppdet/modeling/heads/s2anet_head.py index e2e745d..f13af64 100644 --- a/paddlers/models/ppdet/modeling/heads/s2anet_head.py +++ b/paddlers/models/ppdet/modeling/heads/s2anet_head.py @@ -20,181 +20,13 @@ import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from paddlers.models.ppdet.core.workspace import register -from paddlers.models.ppdet.modeling import ops -from paddlers.models.ppdet.modeling import bbox_utils from paddlers.models.ppdet.modeling.proposal_generator.target_layer import RBoxAssigner +from paddlers.models.ppdet.modeling.proposal_generator.anchor_generator import S2ANetAnchorGenerator +from paddlers.models.ppdet.modeling.layers import AlignConv +from ..cls_utils import _get_class_default_kwargs import numpy as np -class S2ANetAnchorGenerator(nn.Layer): - """ - AnchorGenerator by paddle - """ - - def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): - super(S2ANetAnchorGenerator, self).__init__() - self.base_size = base_size - self.scales = paddle.to_tensor(scales) - self.ratios = paddle.to_tensor(ratios) - self.scale_major = scale_major - self.ctr = ctr - self.base_anchors = self.gen_base_anchors() - - @property - def num_base_anchors(self): - return self.base_anchors.shape[0] - - def gen_base_anchors(self): - w = self.base_size - h = self.base_size - if self.ctr is None: - x_ctr = 0.5 * (w - 1) - y_ctr = 0.5 * (h - 1) - else: - x_ctr, y_ctr = self.ctr - - h_ratios = paddle.sqrt(self.ratios) - w_ratios = 1 / h_ratios - if self.scale_major: - ws = (w * w_ratios[:] * self.scales[:]).reshape([-1]) - hs = (h * h_ratios[:] * self.scales[:]).reshape([-1]) - else: - ws = 
(w * self.scales[:] * w_ratios[:]).reshape([-1]) - hs = (h * self.scales[:] * h_ratios[:]).reshape([-1]) - - base_anchors = paddle.stack( - [ - x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), - x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) - ], - axis=-1) - base_anchors = paddle.round(base_anchors) - return base_anchors - - def _meshgrid(self, x, y, row_major=True): - yy, xx = paddle.meshgrid(y, x) - yy = yy.reshape([-1]) - xx = xx.reshape([-1]) - if row_major: - return xx, yy - else: - return yy, xx - - def forward(self, featmap_size, stride=16): - # featmap_size*stride project it to original area - - feat_h = featmap_size[0] - feat_w = featmap_size[1] - shift_x = paddle.arange(0, feat_w, 1, 'int32') * stride - shift_y = paddle.arange(0, feat_h, 1, 'int32') * stride - shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) - shifts = paddle.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1) - - all_anchors = self.base_anchors[:, :] + shifts[:, :] - all_anchors = all_anchors.reshape([feat_h * feat_w, 4]) - return all_anchors - - def valid_flags(self, featmap_size, valid_size): - feat_h, feat_w = featmap_size - valid_h, valid_w = valid_size - assert valid_h <= feat_h and valid_w <= feat_w - valid_x = paddle.zeros([feat_w], dtype='int32') - valid_y = paddle.zeros([feat_h], dtype='int32') - valid_x[:valid_w] = 1 - valid_y[:valid_h] = 1 - valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) - valid = valid_xx & valid_yy - valid = paddle.reshape(valid, [-1, 1]) - valid = paddle.expand(valid, [-1, self.num_base_anchors]).reshape([-1]) - return valid - - -class AlignConv(nn.Layer): - def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): - super(AlignConv, self).__init__() - self.kernel_size = kernel_size - self.align_conv = paddle.vision.ops.DeformConv2D( - in_channels, - out_channels, - kernel_size=self.kernel_size, - padding=(self.kernel_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)), - bias_attr=None) - - @paddle.no_grad() - def get_offset(self, anchors, featmap_size, stride): - """ - Args: - anchors: [M,5] xc,yc,w,h,angle - featmap_size: (feat_h, feat_w) - stride: 8 - Returns: - - """ - anchors = paddle.reshape(anchors, [-1, 5]) # (NA,5) - dtype = anchors.dtype - feat_h = featmap_size[0] - feat_w = featmap_size[1] - pad = (self.kernel_size - 1) // 2 - idx = paddle.arange(-pad, pad + 1, dtype=dtype) - - yy, xx = paddle.meshgrid(idx, idx) - xx = paddle.reshape(xx, [-1]) - yy = paddle.reshape(yy, [-1]) - - # get sampling locations of default conv - xc = paddle.arange(0, feat_w, dtype=dtype) - yc = paddle.arange(0, feat_h, dtype=dtype) - yc, xc = paddle.meshgrid(yc, xc) - - xc = paddle.reshape(xc, [-1, 1]) - yc = paddle.reshape(yc, [-1, 1]) - x_conv = xc + xx - y_conv = yc + yy - - # get sampling locations of anchors - # x_ctr, y_ctr, w, h, a = np.unbind(anchors, dim=1) - x_ctr = anchors[:, 0] - y_ctr = anchors[:, 1] - w = anchors[:, 2] - h = anchors[:, 3] - a = anchors[:, 4] - - x_ctr = paddle.reshape(x_ctr, [-1, 1]) - y_ctr = paddle.reshape(y_ctr, [-1, 1]) - w = paddle.reshape(w, [-1, 1]) - h = paddle.reshape(h, [-1, 1]) - a = paddle.reshape(a, [-1, 1]) - - x_ctr = x_ctr / stride - y_ctr = y_ctr / stride - w_s = w / stride - h_s = h / stride - cos, sin = paddle.cos(a), paddle.sin(a) - dw, dh = w_s / self.kernel_size, h_s / self.kernel_size - x, y = dw * xx, dh * yy - xr = cos * x - sin * y - yr = sin * x + cos * y - x_anchor, y_anchor = xr + x_ctr, yr + y_ctr - # get offset filed - offset_x = x_anchor - x_conv - offset_y = y_anchor 
- y_conv - offset = paddle.stack([offset_y, offset_x], axis=-1) - offset = paddle.reshape( - offset, [feat_h * feat_w, self.kernel_size * self.kernel_size * 2]) - offset = paddle.transpose(offset, [1, 0]) - offset = paddle.reshape( - offset, - [1, self.kernel_size * self.kernel_size * 2, feat_h, feat_w]) - return offset - - def forward(self, x, refine_anchors, featmap_size, stride): - offset = self.get_offset(refine_anchors, featmap_size, stride) - x = F.relu(self.align_conv(x, offset)) - return x - - @register class S2ANetHead(nn.Layer): """ @@ -215,7 +47,7 @@ class S2ANetHead(nn.Layer): reg_loss_weight (list): loss weight for regression """ __shared__ = ['num_classes'] - __inject__ = ['anchor_assign'] + __inject__ = ['anchor_assign', 'nms'] def __init__(self, stacked_convs=2, @@ -230,10 +62,12 @@ class S2ANetHead(nn.Layer): align_conv_type='AlignConv', align_conv_size=3, use_sigmoid_cls=True, - anchor_assign=RBoxAssigner().__dict__, + anchor_assign=_get_class_default_kwargs(RBoxAssigner), reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.1], cls_loss_weight=[1.1, 1.05], - reg_loss_type='l1'): + reg_loss_type='l1', + nms_pre=2000, + nms='MultiClassNMS'): super(S2ANetHead, self).__init__() self.stacked_convs = stacked_convs self.feat_in = feat_in @@ -251,7 +85,7 @@ class S2ANetHead(nn.Layer): self.align_conv_size = align_conv_size self.use_sigmoid_cls = use_sigmoid_cls - self.cls_out_channels = num_classes if self.use_sigmoid_cls else 1 + self.cls_out_channels = num_classes if self.use_sigmoid_cls else num_classes + 1 self.sampling = False self.anchor_assign = anchor_assign self.reg_loss_weight = reg_loss_weight @@ -259,7 +93,13 @@ class S2ANetHead(nn.Layer): self.alpha = 1.0 self.beta = 1.0 self.reg_loss_type = reg_loss_type - self.s2anet_head_out = None + self.nms_pre = nms_pre + self.nms = nms + self.fake_bbox = paddle.to_tensor( + np.array( + [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], + dtype='float32')) + self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) # anchor self.anchor_generators = [] @@ -402,64 +242,49 @@ class S2ANetHead(nn.Layer): weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) - self.featmap_sizes = [] - self.base_anchors_list = [] - self.refine_anchor_list = [] + def forward(self, feats, targets=None): + fam_reg_list, fam_cls_list = [], [] + odm_reg_list, odm_cls_list = [], [] + num_anchors_list, base_anchors_list, refine_anchors_list = [], [], [] - def forward(self, feats): - fam_reg_branch_list = [] - fam_cls_branch_list = [] + for i, feat in enumerate(feats): + # get shape + B = feat.shape[0] + H, W = paddle.shape(feat)[2], paddle.shape(feat)[3] - odm_reg_branch_list = [] - odm_cls_branch_list = [] + NA = H * W + num_anchors_list.append(NA) - self.featmap_sizes_list = [] - self.base_anchors_list = [] - self.refine_anchor_list = [] - - for feat_idx in range(len(feats)): - feat = feats[feat_idx] fam_cls_feat = self.fam_cls_convs(feat) - fam_cls = self.fam_cls(fam_cls_feat) # [N, CLS, H, W] --> [N, H, W, CLS] - fam_cls = fam_cls.transpose([0, 2, 3, 1]) - fam_cls_reshape = paddle.reshape( - fam_cls, [fam_cls.shape[0], -1, self.cls_out_channels]) - fam_cls_branch_list.append(fam_cls_reshape) + fam_cls = fam_cls.transpose([0, 2, 3, 1]).reshape( + [B, NA, self.cls_out_channels]) + fam_cls_list.append(fam_cls) fam_reg_feat = self.fam_reg_convs(feat) - fam_reg = self.fam_reg(fam_reg_feat) # [N, 5, H, W] --> [N, H, W, 5] - fam_reg = fam_reg.transpose([0, 2, 3, 1]) - fam_reg_reshape = paddle.reshape(fam_reg, 
[fam_reg.shape[0], -1, 5]) - fam_reg_branch_list.append(fam_reg_reshape) + fam_reg = fam_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) + fam_reg_list.append(fam_reg) # prepare anchor - featmap_size = (paddle.shape(feat)[2], paddle.shape(feat)[3]) - self.featmap_sizes_list.append(featmap_size) - init_anchors = self.anchor_generators[feat_idx]( - featmap_size, self.anchor_strides[feat_idx]) - - init_anchors = paddle.to_tensor(init_anchors, dtype='float32') - NA = featmap_size[0] * featmap_size[1] - init_anchors = paddle.reshape(init_anchors, [NA, 4]) - init_anchors = self.rect2rbox(init_anchors) - self.base_anchors_list.append(init_anchors) + init_anchors = self.anchor_generators[i]((H, W), + self.anchor_strides[i]) + init_anchors = init_anchors.reshape([1, NA, 5]) + base_anchors_list.append(init_anchors.squeeze(0)) if self.training: refine_anchor = self.bbox_decode(fam_reg.detach(), init_anchors) else: refine_anchor = self.bbox_decode(fam_reg, init_anchors) - self.refine_anchor_list.append(refine_anchor) + refine_anchors_list.append(refine_anchor) if self.align_conv_type == 'AlignConv': align_feat = self.align_conv(feat, - refine_anchor.clone(), - featmap_size, - self.anchor_strides[feat_idx]) + refine_anchor.clone(), (H, W), + self.anchor_strides[i]) elif self.align_conv_type == 'DCN': align_offset = self.align_conv_offset(feat) align_feat = self.align_conv(feat, align_offset) @@ -473,39 +298,140 @@ class S2ANetHead(nn.Layer): odm_reg_feat = self.odm_reg_convs(odm_reg_feat) odm_cls_feat = self.odm_cls_convs(odm_cls_feat) - odm_cls_score = self.odm_cls(odm_cls_feat) + odm_cls = self.odm_cls(odm_cls_feat) # [N, CLS, H, W] --> [N, H, W, CLS] - odm_cls_score = odm_cls_score.transpose([0, 2, 3, 1]) - odm_cls_score_shape = odm_cls_score.shape - odm_cls_score_reshape = paddle.reshape(odm_cls_score, [ - odm_cls_score_shape[0], odm_cls_score_shape[1] * - odm_cls_score_shape[2], self.cls_out_channels + odm_cls = odm_cls.transpose([0, 2, 3, 1]).reshape( + [B, NA, self.cls_out_channels]) + odm_cls_list.append(odm_cls) + + odm_reg = self.odm_reg(odm_reg_feat) + # [N, 5, H, W] --> [N, H, W, 5] + odm_reg = odm_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) + odm_reg_list.append(odm_reg) + + if self.training: + return self.get_loss([ + fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, + num_anchors_list, base_anchors_list, refine_anchors_list + ], targets) + else: + odm_bboxes_list = [] + for odm_reg, refine_anchor in zip(odm_reg_list, + refine_anchors_list): + odm_bboxes = self.bbox_decode(odm_reg, refine_anchor) + odm_bboxes_list.append(odm_bboxes) + return [odm_bboxes_list, odm_cls_list] + + def get_bboxes(self, head_outs): + pred_bboxes_list, pred_scores_list = head_outs + batch = paddle.shape(pred_scores_list[0])[0] + bboxes, bbox_num = [], [] + for i in range(batch): + pred_scores_per_image = [t[i] for t in pred_scores_list] + pred_bboxes_per_image = [t[i] for t in pred_bboxes_list] + bbox_per_image, bbox_num_per_image = self.get_bboxes_single( + pred_scores_per_image, pred_bboxes_per_image) + bboxes.append(bbox_per_image) + bbox_num.append(bbox_num_per_image) + + bboxes = paddle.concat(bboxes) + bbox_num = paddle.concat(bbox_num) + return bboxes, bbox_num + + def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): + """ + Rescale, clip and filter the bbox from the output of NMS to + get final prediction.
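+ Boxes here are rotated quadrilaterals stored as 8 corner coordinates, so the rescaling and clipping below are applied corner by corner.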
+ Args: + bboxes(Tensor): bboxes [N, 10] + bbox_num(Tensor): bbox_num + im_shape(Tensor): [1, 2] + scale_factor(Tensor): [1, 2] + Returns: + bbox_pred(Tensor): The output is the prediction with shape [N, 10], + including labels, scores and bboxes. The sizes of the + bboxes correspond to the original image. + """ + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + + origin_shape_list = [] + scale_factor_list = [] + # scale_factor: scale_y, scale_x + for i in range(bbox_num.shape[0]): + expand_shape = paddle.expand(origin_shape[i:i + 1, :], + [bbox_num[i], 2]) + scale_y, scale_x = scale_factor[i][0], scale_factor[i][1] + scale = paddle.concat([ + scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, + scale_y ]) + expand_scale = paddle.expand(scale, [bbox_num[i], 8]) + origin_shape_list.append(expand_shape) + scale_factor_list.append(expand_scale) + + origin_shape_list = paddle.concat(origin_shape_list) + scale_factor_list = paddle.concat(scale_factor_list) + + # bboxes: [N, 10], label, score, bbox + pred_label_score = bboxes[:, 0:2] + pred_bbox = bboxes[:, 2:] + + # rescale bbox to original image + pred_bbox = pred_bbox.reshape([-1, 8]) + scaled_bbox = pred_bbox / scale_factor_list + origin_h = origin_shape_list[:, 0] + origin_w = origin_shape_list[:, 1] + + bboxes = scaled_bbox + zeros = paddle.zeros_like(origin_h) + x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros) + y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros) + x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros) + y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros) + x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros) + y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros) + x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros) + y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros) + pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) + pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1) + return pred_result + + def get_bboxes_single(self, cls_score_list, bbox_pred_list): + mlvl_bboxes = [] + mlvl_scores = [] - odm_cls_branch_list.append(odm_cls_score_reshape) + for cls_score, bbox_pred in zip(cls_score_list, bbox_pred_list): + if self.use_sigmoid_cls: + scores = F.sigmoid(cls_score) + else: + scores = F.softmax(cls_score, axis=-1) - odm_bbox_pred = self.odm_reg(odm_reg_feat) - # [N, 5, H, W] --> [N, H, W, 5] - odm_bbox_pred = odm_bbox_pred.transpose([0, 2, 3, 1]) - odm_bbox_pred_reshape = paddle.reshape(odm_bbox_pred, [-1, 5]) - odm_bbox_pred_reshape = paddle.unsqueeze( - odm_bbox_pred_reshape, axis=0) - odm_reg_branch_list.append(odm_bbox_pred_reshape) - - self.s2anet_head_out = (fam_cls_branch_list, fam_reg_branch_list, - odm_cls_branch_list, odm_reg_branch_list) - return self.s2anet_head_out - - def get_prediction(self, nms_pre=2000): - refine_anchors = self.refine_anchor_list - fam_cls_branch_list = self.s2anet_head_out[0] - fam_reg_branch_list = self.s2anet_head_out[1] - odm_cls_branch_list = self.s2anet_head_out[2] - odm_reg_branch_list = self.s2anet_head_out[3] - pred_scores, pred_bboxes = self.get_bboxes( - odm_cls_branch_list, odm_reg_branch_list, refine_anchors, nms_pre, - self.cls_out_channels, self.use_sigmoid_cls) - return pred_scores, pred_bboxes + if scores.shape[0] > self.nms_pre: + # Get maximum scores for foreground classes.
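+ # With sigmoid classification every column is a foreground class; with softmax the last column is the background class and is excluded.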
+ if self.use_sigmoid_cls: + max_scores = paddle.max(scores, axis=1) + else: + max_scores = paddle.max(scores[:, :-1], axis=1) + + topk_val, topk_inds = paddle.topk(max_scores, self.nms_pre) + bbox_pred = paddle.gather(bbox_pred, topk_inds) + scores = paddle.gather(scores, topk_inds) + + mlvl_bboxes.append(bbox_pred) + mlvl_scores.append(scores) + + mlvl_bboxes = paddle.concat(mlvl_bboxes) + mlvl_scores = paddle.concat(mlvl_scores) + + mlvl_polys = self.rbox2poly(mlvl_bboxes).unsqueeze(0) + mlvl_scores = paddle.transpose(mlvl_scores, [1, 0]).unsqueeze(0) + + bbox, bbox_num, _ = self.nms(mlvl_polys, mlvl_scores) + if bbox.shape[0] <= 0: + bbox = self.fake_bbox + bbox_num = self.fake_bbox_num + + return bbox, bbox_num def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0): """ @@ -522,10 +448,10 @@ class S2ANetHead(nn.Layer): diff - 0.5 * delta) return loss - def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='gwd'): + def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='l1'): (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) = fam_target - fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list = s2anet_head_out + fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out fam_cls_losses = [] fam_bbox_losses = [] @@ -534,9 +460,7 @@ class S2ANetHead(nn.Layer): neg_inds) if self.sampling else len(pos_inds) num_total_samples = max(1, num_total_samples) - for idx, feat_size in enumerate(self.featmap_sizes_list): - feat_anchor_num = feat_size[0] * feat_size[1] - + for idx, feat_anchor_num in enumerate(num_anchors_list): # step1: get data feat_labels = labels[st_idx:st_idx + feat_anchor_num] feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] @@ -593,39 +517,8 @@ class S2ANetHead(nn.Layer): feat_bbox_weights = paddle.to_tensor( feat_bbox_weights, stop_gradient=True) - if reg_loss_type == 'l1': - fam_bbox = fam_bbox * feat_bbox_weights - fam_bbox_total = paddle.sum(fam_bbox) / num_total_samples - elif reg_loss_type == 'iou' or reg_loss_type == 'gwd': - fam_bbox = paddle.sum(fam_bbox, axis=-1) - feat_bbox_weights = paddle.sum(feat_bbox_weights, axis=-1) - try: - from rbox_iou_ops import rbox_iou - except Exception as e: - print("import custom_ops error, try install rbox_iou_ops " \ - "following ppdet/ext_op/README.md", e) - sys.stdout.flush() - sys.exit(-1) - # calc iou - fam_bbox_decode = self.delta2rbox(self.base_anchors_list[idx], - fam_bbox_pred) - bbox_gt_bboxes = paddle.to_tensor( - bbox_gt_bboxes, - dtype=fam_bbox_decode.dtype, - place=fam_bbox_decode.place) - bbox_gt_bboxes.stop_gradient = True - iou = rbox_iou(fam_bbox_decode, bbox_gt_bboxes) - iou = paddle.diag(iou) - - if reg_loss_type == 'gwd': - bbox_gt_bboxes_level = bbox_gt_bboxes[st_idx:st_idx + - feat_anchor_num, :] - fam_bbox_total = self.gwd_loss(fam_bbox_decode, - bbox_gt_bboxes_level) - fam_bbox_total = fam_bbox_total * feat_bbox_weights - fam_bbox_total = paddle.sum( - fam_bbox_total) / num_total_samples - + fam_bbox = fam_bbox * feat_bbox_weights + fam_bbox_total = paddle.sum(fam_bbox) / num_total_samples fam_bbox_losses.append(fam_bbox_total) st_idx += feat_anchor_num @@ -636,10 +529,10 @@ class S2ANetHead(nn.Layer): fam_reg_loss = paddle.add_n(fam_bbox_losses) return fam_cls_loss, fam_reg_loss - def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='gwd'): + def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='l1'): (labels, 
label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) = odm_target - fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list = s2anet_head_out + fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out odm_cls_losses = [] odm_bbox_losses = [] @@ -648,9 +541,7 @@ class S2ANetHead(nn.Layer): neg_inds) if self.sampling else len(pos_inds) num_total_samples = max(1, num_total_samples) - for idx, feat_size in enumerate(self.featmap_sizes_list): - feat_anchor_num = feat_size[0] * feat_size[1] - + for idx, feat_anchor_num in enumerate(num_anchors_list): # step1: get data feat_labels = labels[st_idx:st_idx + feat_anchor_num] feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] @@ -708,38 +599,8 @@ class S2ANetHead(nn.Layer): feat_bbox_weights = paddle.to_tensor( feat_bbox_weights, stop_gradient=True) - if reg_loss_type == 'l1': - odm_bbox = odm_bbox * feat_bbox_weights - odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples - elif reg_loss_type == 'iou' or reg_loss_type == 'gwd': - odm_bbox = paddle.sum(odm_bbox, axis=-1) - feat_bbox_weights = paddle.sum(feat_bbox_weights, axis=-1) - try: - from rbox_iou_ops import rbox_iou - except Exception as e: - print("import custom_ops error, try install rbox_iou_ops " \ - "following ppdet/ext_op/README.md", e) - sys.stdout.flush() - sys.exit(-1) - # calc iou - odm_bbox_decode = self.delta2rbox(self.refine_anchor_list[idx], - odm_bbox_pred) - bbox_gt_bboxes = paddle.to_tensor( - bbox_gt_bboxes, - dtype=odm_bbox_decode.dtype, - place=odm_bbox_decode.place) - bbox_gt_bboxes.stop_gradient = True - iou = rbox_iou(odm_bbox_decode, bbox_gt_bboxes) - iou = paddle.diag(iou) - - if reg_loss_type == 'gwd': - bbox_gt_bboxes_level = bbox_gt_bboxes[st_idx:st_idx + - feat_anchor_num, :] - odm_bbox_total = self.gwd_loss(odm_bbox_decode, - bbox_gt_bboxes_level) - odm_bbox_total = odm_bbox_total * feat_bbox_weights - odm_bbox_total = paddle.sum( - odm_bbox_total) / num_total_samples + odm_bbox = odm_bbox * feat_bbox_weights + odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples odm_bbox_losses.append(odm_bbox_total) st_idx += feat_anchor_num @@ -751,8 +612,9 @@ class S2ANetHead(nn.Layer): odm_reg_loss = paddle.add_n(odm_bbox_losses) return odm_cls_loss, odm_reg_loss - def get_loss(self, inputs): - # inputs: im_id image im_shape scale_factor gt_bbox gt_class is_crowd + def get_loss(self, head_outs, inputs): + fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, \ + num_anchors_list, base_anchors_list, refine_anchors_list = head_outs # compute loss fam_cls_loss_lst = [] @@ -760,29 +622,27 @@ class S2ANetHead(nn.Layer): odm_cls_loss_lst = [] odm_reg_loss_lst = [] - im_shape = inputs['im_shape'] - for im_id in range(im_shape.shape[0]): - np_im_shape = inputs['im_shape'][im_id].numpy() - np_scale_factor = inputs['scale_factor'][im_id].numpy() + batch = len(inputs['gt_rbox']) + for i in range(batch): # data_format: (xc, yc, w, h, theta) - gt_bboxes = inputs['gt_rbox'][im_id].numpy() - gt_labels = inputs['gt_class'][im_id].numpy() - is_crowd = inputs['is_crowd'][im_id].numpy() + gt_mask = inputs['pad_gt_mask'][i, :, 0] + gt_idx = paddle.nonzero(gt_mask).squeeze(-1) + gt_bboxes = paddle.gather(inputs['gt_rbox'][i], gt_idx).numpy() + gt_labels = paddle.gather(inputs['gt_class'][i], gt_idx).numpy() + is_crowd = paddle.gather(inputs['is_crowd'][i], gt_idx).numpy() gt_labels = gt_labels + 1 - # featmap_sizes - anchors_list_all = 
np.concatenate(self.base_anchors_list) - - # get im_feat - fam_cls_feats_list = [e[im_id] for e in self.s2anet_head_out[0]] - fam_reg_feats_list = [e[im_id] for e in self.s2anet_head_out[1]] - odm_cls_feats_list = [e[im_id] for e in self.s2anet_head_out[2]] - odm_reg_feats_list = [e[im_id] for e in self.s2anet_head_out[3]] - im_s2anet_head_out = (fam_cls_feats_list, fam_reg_feats_list, - odm_cls_feats_list, odm_reg_feats_list) + anchors_per_image = np.concatenate(base_anchors_list) + fam_cls_per_image = [t[i] for t in fam_cls_list] + fam_reg_per_image = [t[i] for t in fam_reg_list] + odm_cls_per_image = [t[i] for t in odm_cls_list] + odm_reg_per_image = [t[i] for t in odm_reg_list] + im_s2anet_head_out = (fam_cls_per_image, fam_reg_per_image, + odm_cls_per_image, odm_reg_per_image, + num_anchors_list) # FAM - im_fam_target = self.anchor_assign(anchors_list_all, gt_bboxes, + im_fam_target = self.anchor_assign(anchors_per_image, gt_bboxes, gt_labels, is_crowd) if im_fam_target is not None: im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss( @@ -791,11 +651,10 @@ class S2ANetHead(nn.Layer): fam_reg_loss_lst.append(im_fam_reg_loss) # ODM - np_refine_anchors_list = paddle.concat( - self.refine_anchor_list).numpy() - np_refine_anchors_list = np.concatenate(np_refine_anchors_list) - np_refine_anchors_list = np_refine_anchors_list.reshape(-1, 5) - im_odm_target = self.anchor_assign(np_refine_anchors_list, + refine_anchors_per_image = [t[i] for t in refine_anchors_list] + refine_anchors_per_image = paddle.concat( + refine_anchors_per_image).numpy() + im_odm_target = self.anchor_assign(refine_anchors_per_image, gt_bboxes, gt_labels, is_crowd) if im_odm_target is not None: @@ -803,116 +662,38 @@ class S2ANetHead(nn.Layer): im_odm_target, im_s2anet_head_out, self.reg_loss_type) odm_cls_loss_lst.append(im_odm_cls_loss) odm_reg_loss_lst.append(im_odm_reg_loss) - fam_cls_loss = paddle.add_n(fam_cls_loss_lst) - fam_reg_loss = paddle.add_n(fam_reg_loss_lst) - odm_cls_loss = paddle.add_n(odm_cls_loss_lst) - odm_reg_loss = paddle.add_n(odm_reg_loss_lst) + + fam_cls_loss = paddle.add_n(fam_cls_loss_lst) / batch + fam_reg_loss = paddle.add_n(fam_reg_loss_lst) / batch + odm_cls_loss = paddle.add_n(odm_cls_loss_lst) / batch + odm_reg_loss = paddle.add_n(odm_reg_loss_lst) / batch + loss = fam_cls_loss + fam_reg_loss + odm_cls_loss + odm_reg_loss + return { + 'loss': loss, 'fam_cls_loss': fam_cls_loss, 'fam_reg_loss': fam_reg_loss, 'odm_cls_loss': odm_cls_loss, 'odm_reg_loss': odm_reg_loss } - def get_bboxes(self, cls_score_list, bbox_pred_list, mlvl_anchors, nms_pre, - cls_out_channels, use_sigmoid_cls): - assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors) - - mlvl_bboxes = [] - mlvl_scores = [] - - idx = 0 - for cls_score, bbox_pred, anchors in zip(cls_score_list, bbox_pred_list, - mlvl_anchors): - cls_score = paddle.reshape(cls_score, [-1, cls_out_channels]) - if use_sigmoid_cls: - scores = F.sigmoid(cls_score) - else: - scores = F.softmax(cls_score, axis=-1) - - # bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 5) - bbox_pred = paddle.transpose(bbox_pred, [1, 2, 0]) - bbox_pred = paddle.reshape(bbox_pred, [-1, 5]) - anchors = paddle.reshape(anchors, [-1, 5]) - - if scores.shape[0] > nms_pre: - # Get maximum scores for foreground classes. 
- if use_sigmoid_cls: - max_scores = paddle.max(scores, axis=1) - else: - max_scores = paddle.max(scores[:, 1:], axis=1) - - topk_val, topk_inds = paddle.topk(max_scores, nms_pre) - anchors = paddle.gather(anchors, topk_inds) - bbox_pred = paddle.gather(bbox_pred, topk_inds) - scores = paddle.gather(scores, topk_inds) - - bbox_delta = paddle.reshape(bbox_pred, [-1, 5]) - bboxes = self.delta2rbox(anchors, bbox_delta) - mlvl_bboxes.append(bboxes) - mlvl_scores.append(scores) - - idx += 1 - - mlvl_bboxes = paddle.concat(mlvl_bboxes, axis=0) - mlvl_scores = paddle.concat(mlvl_scores) - - return mlvl_scores, mlvl_bboxes - - def rect2rbox(self, bboxes): - """ - :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax) - :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle) - """ - bboxes = paddle.reshape(bboxes, [-1, 4]) - num_boxes = paddle.shape(bboxes)[0] - x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0 - y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0 - edges1 = paddle.abs(bboxes[:, 2] - bboxes[:, 0]) - edges2 = paddle.abs(bboxes[:, 3] - bboxes[:, 1]) - - rbox_w = paddle.maximum(edges1, edges2) - rbox_h = paddle.minimum(edges1, edges2) - - # set angle - inds = edges1 < edges2 - inds = paddle.cast(inds, 'int32') - rboxes_angle = inds * np.pi / 2.0 - - rboxes = paddle.stack( - (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=1) - return rboxes - - # deltas to rbox - def delta2rbox(self, rrois, deltas, wh_ratio_clip=1e-6): - """ - :param rrois: (cx, cy, w, h, theta) - :param deltas: (dx, dy, dw, dh, dtheta) - :param means: means of anchor - :param stds: stds of anchor - :param wh_ratio_clip: clip threshold of wh_ratio - :return: + def bbox_decode(self, preds, anchors, wh_ratio_clip=1e-6): + """decode bbox from deltas + Args: + preds: [B, L, 5] + anchors: [1, L, 5] + return: + bboxes: [B, L, 5] """ - deltas = paddle.reshape(deltas, [-1, 5]) - rrois = paddle.reshape(rrois, [-1, 5]) - # fix dy2st bug denorm_deltas = deltas * self.stds + self.means - denorm_deltas = paddle.add( - paddle.multiply(deltas, self.stds), self.means) - - dx = denorm_deltas[:, 0] - dy = denorm_deltas[:, 1] - dw = denorm_deltas[:, 2] - dh = denorm_deltas[:, 3] - dangle = denorm_deltas[:, 4] + preds = paddle.add(paddle.multiply(preds, self.stds), self.means) + + dx, dy, dw, dh, dangle = paddle.split(preds, 5, axis=-1) max_ratio = np.abs(np.log(wh_ratio_clip)) dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) - rroi_x = rrois[:, 0] - rroi_y = rrois[:, 1] - rroi_w = rrois[:, 2] - rroi_h = rrois[:, 3] - rroi_angle = rrois[:, 4] + rroi_x, rroi_y, rroi_w, rroi_h, rroi_angle = paddle.split( + anchors, 5, axis=-1) gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin( rroi_angle) + rroi_x @@ -922,127 +703,43 @@ class S2ANetHead(nn.Layer): gh = rroi_h * dh.exp() ga = np.pi * dangle + rroi_angle ga = (ga + np.pi / 4) % np.pi - np.pi / 4 - ga = paddle.to_tensor(ga) - gw = paddle.to_tensor(gw, dtype='float32') - gh = paddle.to_tensor(gh, dtype='float32') - bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1) + bboxes = paddle.concat([gx, gy, gw, gh, ga], axis=-1) return bboxes - def bbox_decode(self, bbox_preds, anchors): - """decode bbox from deltas - Args: - bbox_preds: [N,H,W,5] - anchors: [H*W,5] - return: - bboxes: [N,H,W,5] + def rbox2poly(self, rboxes): """ - num_imgs, H, W, _ = bbox_preds.shape - bbox_delta = paddle.reshape(bbox_preds, [-1, 5]) - bboxes = self.delta2rbox(anchors, bbox_delta) - return bboxes - - def trace(self, A): - tr = paddle.diagonal(A, 
axis1=-2, axis2=-1) - tr = paddle.sum(tr, axis=-1) - return tr - - def sqrt_newton_schulz_autograd(self, A, numIters): - A_shape = A.shape - batchSize = A_shape[0] - dim = A_shape[1] - - normA = A * A - normA = paddle.sum(normA, axis=1) - normA = paddle.sum(normA, axis=1) - normA = paddle.sqrt(normA) - normA1 = normA.reshape([batchSize, 1, 1]) - Y = paddle.divide(A, paddle.expand_as(normA1, A)) - I = paddle.eye(dim, dim).reshape([1, dim, dim]) - l0 = [] - for i in range(batchSize): - l0.append(I) - I = paddle.concat(l0, axis=0) - I.stop_gradient = False - Z = paddle.eye(dim, dim).reshape([1, dim, dim]) - l1 = [] - for i in range(batchSize): - l1.append(Z) - Z = paddle.concat(l1, axis=0) - Z.stop_gradient = False - - for i in range(numIters): - T = 0.5 * (3.0 * I - Z.bmm(Y)) - Y = Y.bmm(T) - Z = T.bmm(Z) - sA = Y * paddle.sqrt(normA1).reshape([batchSize, 1, 1]) - sA = paddle.expand_as(sA, A) - return sA - - def wasserstein_distance_sigma(sigma1, sigma2): - wasserstein_distance_item2 = paddle.matmul( - sigma1, sigma1) + paddle.matmul( - sigma2, sigma2) - 2 * self.sqrt_newton_schulz_autograd( - paddle.matmul( - paddle.matmul(sigma1, paddle.matmul(sigma2, sigma2)), - sigma1), 10) - wasserstein_distance_item2 = self.trace(wasserstein_distance_item2) - - return wasserstein_distance_item2 - - def xywhr2xyrs(self, xywhr): - xywhr = paddle.reshape(xywhr, [-1, 5]) - xy = xywhr[:, :2] - wh = paddle.clip(xywhr[:, 2:4], min=1e-7, max=1e7) - r = xywhr[:, 4] - cos_r = paddle.cos(r) - sin_r = paddle.sin(r) - R = paddle.stack( - (cos_r, -sin_r, sin_r, cos_r), axis=-1).reshape([-1, 2, 2]) - S = 0.5 * paddle.nn.functional.diag_embed(wh) - return xy, R, S - - def gwd_loss(self, - pred, - target, - fun='log', - tau=1.0, - alpha=1.0, - normalize=False): - - xy_p, R_p, S_p = self.xywhr2xyrs(pred) - xy_t, R_t, S_t = self.xywhr2xyrs(target) - - xy_distance = (xy_p - xy_t).square().sum(axis=-1) - - Sigma_p = R_p.matmul(S_p.square()).matmul(R_p.transpose([0, 2, 1])) - Sigma_t = R_t.matmul(S_t.square()).matmul(R_t.transpose([0, 2, 1])) - - whr_distance = paddle.diagonal( - S_p, axis1=-2, axis2=-1).square().sum(axis=-1) - - whr_distance = whr_distance + paddle.diagonal( - S_t, axis1=-2, axis2=-1).square().sum(axis=-1) - _t = Sigma_p.matmul(Sigma_t) - - _t_tr = paddle.diagonal(_t, axis1=-2, axis2=-1).sum(axis=-1) - _t_det_sqrt = paddle.diagonal(S_p, axis1=-2, axis2=-1).prod(axis=-1) - _t_det_sqrt = _t_det_sqrt * paddle.diagonal( - S_t, axis1=-2, axis2=-1).prod(axis=-1) - whr_distance = whr_distance + (-2) * ( - (_t_tr + 2 * _t_det_sqrt).clip(0).sqrt()) - - distance = (xy_distance + alpha * alpha * whr_distance).clip(0) - - if normalize: - wh_p = pred[..., 2:4].clip(min=1e-7, max=1e7) - wh_t = target[..., 2:4].clip(min=1e-7, max=1e7) - scale = ((wh_p.log() + wh_t.log()).sum(dim=-1) / 4).exp() - distance = distance / scale - - if fun == 'log': - distance = paddle.log1p(distance) - - if tau >= 1.0: - return 1 - 1 / (tau + distance) - - return distance + rboxes: [x_ctr,y_ctr,w,h,angle] + to + polys: [x0,y0,x1,y1,x2,y2,x3,y3] + """ + N = paddle.shape(rboxes)[0] + + x_ctr = rboxes[:, 0] + y_ctr = rboxes[:, 1] + width = rboxes[:, 2] + height = rboxes[:, 3] + angle = rboxes[:, 4] + + tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5 + + normal_rects = paddle.stack( + [tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0) + normal_rects = paddle.reshape(normal_rects, [2, 4, N]) + normal_rects = paddle.transpose(normal_rects, [2, 0, 1]) + + sin, cos = paddle.sin(angle), paddle.cos(angle) + # M: 
[N,2,2] + M = paddle.stack([cos, -sin, sin, cos], axis=0) + M = paddle.reshape(M, [2, 2, N]) + M = paddle.transpose(M, [2, 0, 1]) + + # polys: [N,8] + polys = paddle.matmul(M, normal_rects) + polys = paddle.transpose(polys, [2, 1, 0]) + polys = paddle.reshape(polys, [-1, N]) + polys = paddle.transpose(polys, [1, 0]) + + tmp = paddle.stack( + [x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1) + polys = polys + tmp + return polys diff --git a/paddlers/models/ppdet/modeling/heads/simota_head.py b/paddlers/models/ppdet/modeling/heads/simota_head.py index 62be2b6..7467747 100644 --- a/paddlers/models/ppdet/modeling/heads/simota_head.py +++ b/paddlers/models/ppdet/modeling/heads/simota_head.py @@ -132,8 +132,8 @@ class OTAHead(GFLHead): yy, xx = self.get_single_level_center_point(featmap_size, stride, self.cell_offset) - center_and_stride = paddle.stack([xx, yy, stride, stride], - -1).tile([num_imgs, 1, 1]) + center_and_stride = paddle.stack([xx, yy, stride, stride], -1).tile( + [num_imgs, 1, 1]) center_and_strides.append(center_and_stride) center_in_feature = center_and_stride.reshape( [-1, 4])[:, :-2] / stride @@ -179,8 +179,8 @@ class OTAHead(GFLHead): num_level_anchors) num_total_pos = sum(pos_num_l) try: - num_total_pos = paddle.distributed.all_reduce(num_total_pos.clone( - )) / paddle.distributed.get_world_size() + paddle.distributed.all_reduce(num_total_pos) + num_total_pos = num_total_pos / paddle.distributed.get_world_size() except: num_total_pos = max(num_total_pos, 1) @@ -255,7 +255,7 @@ class OTAHead(GFLHead): avg_factor = sum(avg_factor) try: - avg_factor = paddle.distributed.all_reduce(avg_factor.clone()) + paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: @@ -396,8 +396,8 @@ class OTAVFLHead(OTAHead): num_level_anchors) num_total_pos = sum(pos_num_l) try: - num_total_pos = paddle.distributed.all_reduce(num_total_pos.clone( - )) / paddle.distributed.get_world_size() + paddle.distributed.all_reduce(num_total_pos) + num_total_pos = num_total_pos / paddle.distributed.get_world_size() except: num_total_pos = max(num_total_pos, 1) @@ -475,7 +475,7 @@ class OTAVFLHead(OTAHead): avg_factor = sum(avg_factor) try: - avg_factor = paddle.distributed.all_reduce(avg_factor.clone()) + paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: diff --git a/paddlers/models/ppdet/modeling/heads/ssd_head.py b/paddlers/models/ppdet/modeling/heads/ssd_head.py index 060e4c3..9f4b50f 100644 --- a/paddlers/models/ppdet/modeling/heads/ssd_head.py +++ b/paddlers/models/ppdet/modeling/heads/ssd_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import paddle @@ -20,6 +20,7 @@ from paddle.regularizer import L2Decay from paddle import ParamAttr from ..layers import AnchorGeneratorSSD +from ..cls_utils import _get_class_default_kwargs class SepConvLayer(nn.Layer): @@ -113,7 +114,7 @@ class SSDHead(nn.Layer): def __init__(self, num_classes=80, in_channels=(512, 1024, 512, 256, 256, 256), - anchor_generator=AnchorGeneratorSSD().__dict__, + anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD), kernel_size=3, padding=1, use_sepconv=False, diff --git a/paddlers/models/ppdet/modeling/heads/tood_head.py b/paddlers/models/ppdet/modeling/heads/tood_head.py index b479ba7..0a49cd8 100644 --- a/paddlers/models/ppdet/modeling/heads/tood_head.py +++ b/paddlers/models/ppdet/modeling/heads/tood_head.py @@ -218,13 +218,17 @@ class TOODHead(nn.Layer): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" - anchors, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell( + anchors, anchor_points, num_anchors_list, stride_tensor =\ + generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) + anchor_centers_split = paddle.split(anchor_points / stride_tensor, + num_anchors_list) cls_score_list, bbox_pred_list = [], [] - for feat, scale_reg, anchor, stride in zip(feats, self.scales_regs, - anchors, self.fpn_strides): + for feat, scale_reg, anchor_centers, stride in zip( + feats, self.scales_regs, anchor_centers_split, + self.fpn_strides): b, _, h, w = get_static_shape(feat) inter_feats = [] for inter_conv in self.inter_convs: @@ -250,8 +254,8 @@ class TOODHead(nn.Layer): # reg prediction and alignment reg_dist = scale_reg(self.tood_reg(reg_feat).exp()) reg_dist = reg_dist.flatten(2).transpose([0, 2, 1]) - anchor_centers = bbox_center(anchor).unsqueeze(0) / stride - reg_bbox = batch_distance2bbox(anchor_centers, reg_dist) + reg_bbox = batch_distance2bbox( + anchor_centers.unsqueeze(0), reg_dist) if self.use_align_head: reg_offset = F.relu(self.reg_offset_conv1(feat)) reg_offset = self.reg_offset_conv2(reg_offset) @@ -268,12 +272,8 @@ class TOODHead(nn.Layer): bbox_pred_list.append(bbox_pred) cls_score_list = paddle.concat(cls_score_list, axis=1) bbox_pred_list = paddle.concat(bbox_pred_list, axis=1) - anchors = paddle.concat(anchors) - anchors.stop_gradient = True - stride_tensor_list = paddle.concat(stride_tensor_list).unsqueeze(0) - stride_tensor_list.stop_gradient = True - return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor_list + return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): @@ -286,9 +286,11 @@ class TOODHead(nn.Layer): return loss def get_loss(self, head_outs, gt_meta): - pred_scores, pred_bboxes, anchors, num_anchors_list, stride_tensor_list = head_outs + pred_scores, pred_bboxes, anchors, \ + num_anchors_list, stride_tensor = head_outs gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] + pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < 
self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( @@ -296,20 +298,23 @@ class TOODHead(nn.Layer): num_anchors_list, gt_labels, gt_bboxes, + pad_gt_mask, bg_index=self.num_classes) alpha_l = 0.25 else: assigned_labels, assigned_bboxes, assigned_scores = self.assigner( pred_scores.detach(), - pred_bboxes.detach() * stride_tensor_list, + pred_bboxes.detach() * stride_tensor, bbox_center(anchors), + num_anchors_list, gt_labels, gt_bboxes, + pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # rescale bbox - assigned_bboxes /= stride_tensor_list + assigned_bboxes /= stride_tensor # classification loss loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=alpha_l) # select positive samples mask diff --git a/paddlers/models/ppdet/modeling/heads/ttf_head.py b/paddlers/models/ppdet/modeling/heads/ttf_head.py index 9b3fac2..928374c 100644 --- a/paddlers/models/ppdet/modeling/heads/ttf_head.py +++ b/paddlers/models/ppdet/modeling/heads/ttf_head.py @@ -31,7 +31,7 @@ class HMHead(nn.Layer): ch_out (int): The channel number of output Tensor. num_classes (int): Number of classes. conv_num (int): The convolution number of hm_feat. - dcn_head(bool): whether use dcn in head. False by default. + dcn_head(bool): whether use dcn in head. False by default. lite_head(bool): whether use lite version. False by default. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default diff --git a/paddlers/models/ppdet/modeling/heads/yolo_head.py b/paddlers/models/ppdet/modeling/heads/yolo_head.py index 1aa4892..dc884a3 100644 --- a/paddlers/models/ppdet/modeling/heads/yolo_head.py +++ b/paddlers/models/ppdet/modeling/heads/yolo_head.py @@ -1,3 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -5,6 +19,17 @@ from paddle import ParamAttr from paddle.regularizer import L2Decay from paddlers.models.ppdet.core.workspace import register +import math +import numpy as np +from ..initializer import bias_init_with_prob, constant_ +from ..backbones.csp_darknet import BaseConv, DWConv +from ..losses import IouLoss +from paddlers.models.ppdet.modeling.assigners.simota_assigner import SimOTAAssigner +from paddlers.models.ppdet.modeling.bbox_utils import bbox_overlaps +from paddlers.models.ppdet.modeling.layers import MultiClassNMS + +__all__ = ['YOLOv3Head', 'YOLOXHead'] + def _de_sigmoid(x, eps=1e-7): x = paddle.clip(x, eps, 1. 
/ eps) @@ -122,3 +147,270 @@ class YOLOv3Head(nn.Layer): @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } + + +@register +class YOLOXHead(nn.Layer): + __shared__ = ['num_classes', 'width_mult', 'act', 'trt', 'exclude_nms'] + __inject__ = ['assigner', 'nms'] + + def __init__(self, + num_classes=80, + width_mult=1.0, + depthwise=False, + in_channels=[256, 512, 1024], + feat_channels=256, + fpn_strides=(8, 16, 32), + l1_epoch=285, + act='silu', + assigner=SimOTAAssigner(use_vfl=False), + nms='MultiClassNMS', + loss_weight={ + 'cls': 1.0, + 'obj': 1.0, + 'iou': 5.0, + 'l1': 1.0, + }, + trt=False, + exclude_nms=False): + super(YOLOXHead, self).__init__() + self._dtype = paddle.framework.get_default_dtype() + self.num_classes = num_classes + assert len(in_channels) > 0, "in_channels length should be > 0" + self.in_channels = in_channels + feat_channels = int(feat_channels * width_mult) + self.fpn_strides = fpn_strides + self.l1_epoch = l1_epoch + self.assigner = assigner + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.loss_weight = loss_weight + self.iou_loss = IouLoss(loss_weight=1.0) # default loss_weight 2.5 + + ConvBlock = DWConv if depthwise else BaseConv + + self.stem_conv = nn.LayerList() + self.conv_cls = nn.LayerList() + self.conv_reg = nn.LayerList() # reg [x,y,w,h] + obj + for in_c in self.in_channels: + self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act)) + + self.conv_cls.append( + nn.Sequential(*[ + ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), + nn.Conv2D( + feat_channels, + self.num_classes, + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + ])) + + self.conv_reg.append( + nn.Sequential(*[ + ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), + ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), + nn.Conv2D( + feat_channels, + 4 + 1, # reg [x,y,w,h] + obj + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + ])) + + self._init_weights() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _init_weights(self): + bias_cls = bias_init_with_prob(0.01) + bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype) + bias_reg[:2] = 0.
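+ # x/y offset slots start at 0, w/h slots keep the log(5.) prior, and the last (objectness) slot below reuses the classification prior bias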
+ bias_reg[-1] = bias_cls + for cls_, reg_ in zip(self.conv_cls, self.conv_reg): + constant_(cls_[-1].weight) + constant_(cls_[-1].bias, bias_cls) + constant_(reg_[-1].weight) + reg_[-1].bias.set_value(bias_reg) + + def _generate_anchor_point(self, feat_sizes, strides, offset=0.): + anchor_points, stride_tensor = [], [] + num_anchors_list = [] + for feat_size, stride in zip(feat_sizes, strides): + h, w = feat_size + x = (paddle.arange(w) + offset) * stride + y = (paddle.arange(h) + offset) * stride + y, x = paddle.meshgrid(y, x) + anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2])) + stride_tensor.append( + paddle.full( + [len(anchor_points[-1]), 1], stride, dtype=self._dtype)) + num_anchors_list.append(len(anchor_points[-1])) + anchor_points = paddle.concat(anchor_points).astype(self._dtype) + anchor_points.stop_gradient = True + stride_tensor = paddle.concat(stride_tensor) + stride_tensor.stop_gradient = True + return anchor_points, stride_tensor, num_anchors_list + + def forward(self, feats, targets=None): + assert len(feats) == len(self.fpn_strides), \ + "The size of feats is not equal to size of fpn_strides" + + feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats] + cls_score_list, reg_pred_list = [], [] + obj_score_list = [] + for i, feat in enumerate(feats): + feat = self.stem_conv[i](feat) + cls_logit = self.conv_cls[i](feat) + reg_pred = self.conv_reg[i](feat) + # cls prediction + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) + # reg prediction + reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1) + reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1]) + reg_pred_list.append(reg_xywh) + # obj prediction + obj_score = F.sigmoid(obj_logit) + obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1])) + + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_pred_list = paddle.concat(reg_pred_list, axis=1) + obj_score_list = paddle.concat(obj_score_list, axis=1) + + # bbox decode + anchor_points, stride_tensor, _ =\ + self._generate_anchor_point(feat_sizes, self.fpn_strides) + reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1) + reg_xy += (anchor_points / stride_tensor) + reg_wh = paddle.exp(reg_wh) * 0.5 + bbox_pred_list = paddle.concat( + [reg_xy - reg_wh, reg_xy + reg_wh], axis=-1) + + if self.training: + anchor_points, stride_tensor, num_anchors_list =\ + self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5) + yolox_losses = self.get_loss([ + cls_score_list, bbox_pred_list, obj_score_list, anchor_points, + stride_tensor, num_anchors_list + ], targets) + return yolox_losses + else: + pred_scores = (cls_score_list * obj_score_list).sqrt() + return pred_scores, bbox_pred_list, stride_tensor + + def get_loss(self, head_outs, targets): + pred_cls, pred_bboxes, pred_obj,\ + anchor_points, stride_tensor, num_anchors_list = head_outs + gt_labels = targets['gt_class'] + gt_bboxes = targets['gt_bbox'] + pred_scores = (pred_cls * pred_obj).sqrt() + # label assignment + center_and_strides = paddle.concat( + [anchor_points, stride_tensor, stride_tensor], axis=-1) + pos_num_list, label_list, bbox_target_list = [], [], [] + for pred_score, pred_bbox, gt_box, gt_label in zip( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels): + pos_num, label, _, bbox_target = self.assigner( + pred_score, center_and_strides, pred_bbox, gt_box, gt_label) + pos_num_list.append(pos_num) + label_list.append(label) + bbox_target_list.append(bbox_target) + labels = 
paddle.to_tensor(np.stack(label_list, axis=0)) + bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0)) + bbox_targets /= stride_tensor # rescale bbox + + # 1. obj score loss + mask_positive = (labels != self.num_classes) + loss_obj = F.binary_cross_entropy( + pred_obj, + mask_positive.astype(pred_obj.dtype).unsqueeze(-1), + reduction='sum') + + num_pos = sum(pos_num_list) + + if num_pos > 0: + num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1) + loss_obj /= num_pos + + # 2. iou loss + bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) + pred_bboxes_pos = paddle.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = paddle.masked_select( + bbox_targets, bbox_mask).reshape([-1, 4]) + bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos) + bbox_iou = paddle.diag(bbox_iou) + + loss_iou = self.iou_loss( + pred_bboxes_pos.split( + 4, axis=-1), + assigned_bboxes_pos.split( + 4, axis=-1)) + loss_iou = loss_iou.sum() / num_pos + + # 3. cls loss + cls_mask = mask_positive.unsqueeze(-1).tile( + [1, 1, self.num_classes]) + pred_cls_pos = paddle.masked_select( + pred_cls, cls_mask).reshape([-1, self.num_classes]) + assigned_cls_pos = paddle.masked_select(labels, mask_positive) + assigned_cls_pos = F.one_hot(assigned_cls_pos, + self.num_classes + 1)[..., :-1] + assigned_cls_pos *= bbox_iou.unsqueeze(-1) + loss_cls = F.binary_cross_entropy( + pred_cls_pos, assigned_cls_pos, reduction='sum') + loss_cls /= num_pos + + # 4. l1 loss + if targets['epoch_id'] >= self.l1_epoch: + loss_l1 = F.l1_loss( + pred_bboxes_pos, assigned_bboxes_pos, reduction='sum') + loss_l1 /= num_pos + else: + loss_l1 = paddle.zeros([1]) + loss_l1.stop_gradient = False + else: + loss_cls = paddle.zeros([1]) + loss_iou = paddle.zeros([1]) + loss_l1 = paddle.zeros([1]) + loss_cls.stop_gradient = False + loss_iou.stop_gradient = False + loss_l1.stop_gradient = False + + loss = self.loss_weight['obj'] * loss_obj + \ + self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + + if targets['epoch_id'] >= self.l1_epoch: + loss += (self.loss_weight['l1'] * loss_l1) + + yolox_losses = { + 'loss': loss, + 'loss_cls': loss_cls, + 'loss_obj': loss_obj, + 'loss_iou': loss_iou, + 'loss_l1': loss_l1, + } + return yolox_losses + + def post_process(self, head_outs, img_shape, scale_factor): + pred_scores, pred_bboxes, stride_tensor = head_outs + pred_scores = pred_scores.transpose([0, 2, 1]) + pred_bboxes *= stride_tensor + # scale bbox to origin image + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes.sum(), pred_scores.sum() + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/paddlers/models/ppdet/modeling/initializer.py b/paddlers/models/ppdet/modeling/initializer.py index 1a1ce84..5c0f8d5 100644 --- a/paddlers/models/ppdet/modeling/initializer.py +++ b/paddlers/models/ppdet/modeling/initializer.py @@ -273,7 +273,8 @@ def linear_init_(module): def conv_init_(module): bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) uniform_(module.weight, -bound, bound) - uniform_(module.bias, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) def bias_init_with_prob(prior_prob=0.01): diff --git a/paddlers/models/ppdet/modeling/layers.py b/paddlers/models/ppdet/modeling/layers.py index 6a0cca1..5ee134f 100644 --- 
a/paddlers/models/ppdet/modeling/layers.py +++ b/paddlers/models/ppdet/modeling/layers.py @@ -39,6 +39,81 @@ def _to_list(l): return [l] + +class AlignConv(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): + super(AlignConv, self).__init__() + self.kernel_size = kernel_size + self.align_conv = paddle.vision.ops.DeformConv2D( + in_channels, + out_channels, + kernel_size=self.kernel_size, + padding=(self.kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)), + bias_attr=None) + + @paddle.no_grad() + def get_offset(self, anchors, featmap_size, stride): + """ + Args: + anchors: [B, L, 5] xc,yc,w,h,angle + featmap_size: (feat_h, feat_w) + stride: 8 + Returns: + offset (Tensor): [B, kernel_size * kernel_size * 2, feat_h, feat_w] + """ + batch = anchors.shape[0] + dtype = anchors.dtype + feat_h, feat_w = featmap_size + pad = (self.kernel_size - 1) // 2 + idx = paddle.arange(-pad, pad + 1, dtype=dtype) + + yy, xx = paddle.meshgrid(idx, idx) + xx = paddle.reshape(xx, [-1]) + yy = paddle.reshape(yy, [-1]) + + # get sampling locations of default conv + xc = paddle.arange(0, feat_w, dtype=dtype) + yc = paddle.arange(0, feat_h, dtype=dtype) + yc, xc = paddle.meshgrid(yc, xc) + + xc = paddle.reshape(xc, [-1, 1]) + yc = paddle.reshape(yc, [-1, 1]) + x_conv = xc + xx + y_conv = yc + yy + + # get sampling locations of anchors + x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1) + x_ctr = x_ctr / stride + y_ctr = y_ctr / stride + w_s = w / stride + h_s = h / stride + cos, sin = paddle.cos(a), paddle.sin(a) + dw, dh = w_s / self.kernel_size, h_s / self.kernel_size + x, y = dw * xx, dh * yy + xr = cos * x - sin * y + yr = sin * x + cos * y + x_anchor, y_anchor = xr + x_ctr, yr + y_ctr + # get offset field + offset_x = x_anchor - x_conv + offset_y = y_anchor - y_conv + offset = paddle.stack([offset_y, offset_x], axis=-1) + offset = offset.reshape( + [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2]) + offset = offset.transpose([0, 3, 1, 2]) + + return offset + + def forward(self, x, refine_anchors, featmap_size, stride): + batch = paddle.shape(x)[0].numpy() + offset = self.get_offset(refine_anchors, featmap_size, stride) + if self.training: + x = F.relu(self.align_conv(x, offset.detach())) + else: + x = F.relu(self.align_conv(x, offset)) + return x + + class DeformableConvV2(nn.Layer): def __init__(self, in_channels, @@ -128,7 +203,7 @@ class ConvNormLayer(nn.Layer): dcn_lr_scale=2., dcn_regularizer=L2Decay(0.)): super(ConvNormLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn'] + assert norm_type in ['bn', 'sync_bn', 'gn', None] if bias_on: bias_attr = ParamAttr( @@ -183,10 +258,13 @@ class ConvNormLayer(nn.Layer): num_channels=ch_out, weight_attr=param_attr, bias_attr=bias_attr) + else: + self.norm = None def forward(self, inputs): out = self.conv(inputs) - out = self.norm(out) + if self.norm is not None: + out = self.norm(out) return out @@ -248,7 +326,7 @@ class LiteConv(nn.Layer): class DropBlock(nn.Layer): - def __init__(self, block_size, keep_prob, name, data_format='NCHW'): + def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'): """ DropBlock layer, see https://arxiv.org/abs/1810.12890 @@ -360,18 +438,20 @@ class AnchorGeneratorSSD(object): @register @serializable class RCNNBox(object): - __shared__ = ['num_classes'] + __shared__ = ['num_classes', 'export_onnx'] def __init__(self, prior_box_var=[10., 10., 5., 5.], code_type="decode_center_size", box_normalized=False, - num_classes=80): + num_classes=80, + export_onnx=False): super(RCNNBox,
self).__init__() self.prior_box_var = prior_box_var self.code_type = code_type self.box_normalized = box_normalized self.num_classes = num_classes + self.export_onnx = export_onnx def __call__(self, bbox_head_out, rois, im_shape, scale_factor): bbox_pred = bbox_head_out[0] @@ -379,39 +459,38 @@ class RCNNBox(object): roi = rois[0] rois_num = rois[1] - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - scale_list = [] - origin_shape_list = [] + if self.export_onnx: + onnx_rois_num_per_im = rois_num[0] + origin_shape = paddle.expand(im_shape[0, :], + [onnx_rois_num_per_im, 2]) - batch_size = 1 - if isinstance(roi, list): - batch_size = len(roi) else: - batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) - # bbox_pred.shape: [N, C*4] - for idx in range(batch_size): - roi_per_im = roi[idx] - rois_num_per_im = rois_num[idx] - expand_im_shape = paddle.expand(im_shape[idx, :], - [rois_num_per_im, 2]) - origin_shape_list.append(expand_im_shape) + origin_shape_list = [] + if isinstance(roi, list): + batch_size = len(roi) + else: + batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) + + # bbox_pred.shape: [N, C*4] + for idx in range(batch_size): + rois_num_per_im = rois_num[idx] + expand_im_shape = paddle.expand(im_shape[idx, :], + [rois_num_per_im, 2]) + origin_shape_list.append(expand_im_shape) - origin_shape = paddle.concat(origin_shape_list) + origin_shape = paddle.concat(origin_shape_list) # bbox_pred.shape: [N, C*4] # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head) bbox = paddle.concat(roi) - if bbox.shape[0] == 0: - bbox = paddle.zeros([0, bbox_pred.shape[1]], dtype='float32') - else: - bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) + bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) scores = cls_prob[:, :-1] # bbox.shape: [N, C, 4] # bbox.shape[1] must be equal to scores.shape[1] - bbox_num_class = bbox.shape[1] - if bbox_num_class == 1: - bbox = paddle.tile(bbox, [1, self.num_classes, 1]) + total_num = bbox.shape[0] + bbox_dim = bbox.shape[-1] + bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim]) origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1) origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1) @@ -436,7 +515,8 @@ class MultiClassNMS(object): normalized=True, nms_eta=1.0, return_index=False, - return_rois_num=True): + return_rois_num=True, + trt=False): super(MultiClassNMS, self).__init__() self.score_threshold = score_threshold self.nms_top_k = nms_top_k @@ -446,20 +526,21 @@ class MultiClassNMS(object): self.nms_eta = nms_eta self.return_index = return_index self.return_rois_num = return_rois_num + self.trt = trt def __call__(self, bboxes, score, background_label=-1): """ - bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape + bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape [N, M, 4], N is the batch size and M is the number of bboxes 2. (List[Tensor]) bboxes and bbox_num, bboxes have shape of [M, C, 4], C is the class number and bbox_num means the number of bboxes of each batch with - shape [N,] + shape [N,] score (Tensor): Predicted scores with shape [N, C, M] or [M, C] background_label (int): Ignore the background label; For example, RCNN - is num_classes and YOLO is -1. + is num_classes and YOLO is -1. 
""" kwargs = self.__dict__.copy() if isinstance(bboxes, tuple): @@ -467,7 +548,20 @@ class MultiClassNMS(object): kwargs.update({'rois_num': bbox_num}) if background_label > -1: kwargs.update({'background_label': background_label}) - return ops.multiclass_nms(bboxes, score, **kwargs) + kwargs.pop('trt') + # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt + if self.trt and (int(paddle.version.major) == 0 or + (int(paddle.version.major) >= 2 and + int(paddle.version.minor) >= 3)): + # TODO(wangxinxin08): tricky switch to run nms on tensorrt + kwargs.update({'nms_eta': 1.1}) + bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs) + bbox = bbox.reshape([1, -1, 6]) + idx = paddle.nonzero(bbox[..., 0] != -1) + bbox = paddle.gather_nd(bbox, idx) + return bbox, bbox_num, None + else: + return ops.multiclass_nms(bboxes, score, **kwargs) @register @@ -536,10 +630,15 @@ class YOLOBox(object): origin_shape = im_shape / scale_factor origin_shape = paddle.cast(origin_shape, 'int32') for i, head_out in enumerate(yolo_head_out): - boxes, scores = ops.yolo_box(head_out, origin_shape, anchors[i], - self.num_classes, self.conf_thresh, - self.downsample_ratio // 2**i, - self.clip_bbox, self.scale_x_y) + boxes, scores = paddle.vision.ops.yolo_box( + head_out, + origin_shape, + anchors[i], + self.num_classes, + self.conf_thresh, + self.downsample_ratio // 2**i, + self.clip_bbox, + scale_x_y=self.scale_x_y) boxes_list.append(boxes) scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) yolo_boxes = paddle.concat(boxes_list, axis=1) @@ -550,9 +649,14 @@ class YOLOBox(object): @register @serializable class SSDBox(object): - def __init__(self, is_normalized=True): + def __init__(self, + is_normalized=True, + prior_box_var=[0.1, 0.1, 0.2, 0.2], + use_fuse_decode=False): self.is_normalized = is_normalized self.norm_delta = float(not self.is_normalized) + self.prior_box_var = prior_box_var + self.use_fuse_decode = use_fuse_decode def __call__(self, preds, @@ -561,128 +665,42 @@ class SSDBox(object): scale_factor, var_weight=None): boxes, scores = preds - outputs = [] - for box, score, prior_box in zip(boxes, scores, prior_boxes): - pb_w = prior_box[:, 2] - prior_box[:, 0] + self.norm_delta - pb_h = prior_box[:, 3] - prior_box[:, 1] + self.norm_delta - pb_x = prior_box[:, 0] + pb_w * 0.5 - pb_y = prior_box[:, 1] + pb_h * 0.5 - out_x = pb_x + box[:, :, 0] * pb_w * 0.1 - out_y = pb_y + box[:, :, 1] * pb_h * 0.1 - out_w = paddle.exp(box[:, :, 2] * 0.2) * pb_w - out_h = paddle.exp(box[:, :, 3] * 0.2) * pb_h - - if self.is_normalized: - h = paddle.unsqueeze( - im_shape[:, 0] / scale_factor[:, 0], axis=-1) - w = paddle.unsqueeze( - im_shape[:, 1] / scale_factor[:, 1], axis=-1) - output = paddle.stack( - [(out_x - out_w / 2.) * w, (out_y - out_h / 2.) * h, - (out_x + out_w / 2.) * w, (out_y + out_h / 2.) * h], - axis=-1) - else: - output = paddle.stack( - [ - out_x - out_w / 2., out_y - out_h / 2., - out_x + out_w / 2. - 1., out_y + out_h / 2. - 1. - ], - axis=-1) - outputs.append(output) - boxes = paddle.concat(outputs, axis=1) - - scores = F.softmax(paddle.concat(scores, axis=1)) - scores = paddle.transpose(scores, [0, 2, 1]) - - return boxes, scores - - -@register -@serializable -class AnchorGrid(object): - """Generate anchor grid - - Args: - image_size (int or list): input image size, may be a single integer or - list of [h, w]. Default: 512 - min_level (int): min level of the feature pyramid. Default: 3 - max_level (int): max level of the feature pyramid. 
Default: 7 - anchor_base_scale: base anchor scale. Default: 4 - num_scales: number of anchor scales. Default: 3 - aspect_ratios: aspect ratios. default: [[1, 1], [1.4, 0.7], [0.7, 1.4]] - """ - - def __init__(self, - image_size=512, - min_level=3, - max_level=7, - anchor_base_scale=4, - num_scales=3, - aspect_ratios=[[1, 1], [1.4, 0.7], [0.7, 1.4]]): - super(AnchorGrid, self).__init__() - if isinstance(image_size, Integral): - self.image_size = [image_size, image_size] + boxes = paddle.concat(boxes, axis=1) + prior_boxes = paddle.concat(prior_boxes) + if self.use_fuse_decode: + output_boxes = ops.box_coder( + prior_boxes, + self.prior_box_var, + boxes, + code_type="decode_center_size", + box_normalized=self.is_normalized) else: - self.image_size = image_size - for dim in self.image_size: - assert dim % 2 ** max_level == 0, \ - "image size should be multiple of the max level stride" - self.min_level = min_level - self.max_level = max_level - self.anchor_base_scale = anchor_base_scale - self.num_scales = num_scales - self.aspect_ratios = aspect_ratios + pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta + pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta + pb_x = prior_boxes[:, 0] + pb_w * 0.5 + pb_y = prior_boxes[:, 1] + pb_h * 0.5 + out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0] + out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1] + out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w + out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h + output_boxes = paddle.stack( + [ + out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2., + out_y + out_h / 2. + ], + axis=-1) + + if self.is_normalized: + h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1) + w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1) + im_shape = paddle.stack([w, h, w, h], axis=-1) + output_boxes *= im_shape + else: + output_boxes[..., -2:] -= 1.0 + output_scores = F.softmax(paddle.concat( + scores, axis=1)).transpose([0, 2, 1]) - @property - def base_cell(self): - if not hasattr(self, '_base_cell'): - self._base_cell = self.make_cell() - return self._base_cell - - def make_cell(self): - scales = [2**(i / self.num_scales) for i in range(self.num_scales)] - scales = np.array(scales) - ratios = np.array(self.aspect_ratios) - ws = np.outer(scales, ratios[:, 0]).reshape(-1, 1) - hs = np.outer(scales, ratios[:, 1]).reshape(-1, 1) - anchors = np.hstack((-0.5 * ws, -0.5 * hs, 0.5 * ws, 0.5 * hs)) - return anchors - - def make_grid(self, stride): - cell = self.base_cell * stride * self.anchor_base_scale - x_steps = np.arange(stride // 2, self.image_size[1], stride) - y_steps = np.arange(stride // 2, self.image_size[0], stride) - offset_x, offset_y = np.meshgrid(x_steps, y_steps) - offset_x = offset_x.flatten() - offset_y = offset_y.flatten() - offsets = np.stack((offset_x, offset_y, offset_x, offset_y), axis=-1) - offsets = offsets[:, np.newaxis, :] - return (cell + offsets).reshape(-1, 4) - - def generate(self): - return [ - self.make_grid(2**l) - for l in range(self.min_level, self.max_level + 1) - ] - - def __call__(self): - if not hasattr(self, '_anchor_vars'): - anchor_vars = [] - helper = LayerHelper('anchor_grid') - for idx, l in enumerate(range(self.min_level, self.max_level + 1)): - stride = 2**l - anchors = self.make_grid(stride) - var = helper.create_parameter( - attr=ParamAttr(name='anchors_{}'.format(idx)), - shape=anchors.shape, - dtype='float32', - stop_gradient=True, - default_initializer=NumpyArrayInitializer(anchors)) - 
anchor_vars.append(var) - var.persistable = True - self._anchor_vars = anchor_vars - - return self._anchor_vars + return output_boxes, output_scores @register @@ -722,7 +740,7 @@ class FCOSBox(object): Postprocess each layer of the output with corresponding locations. Args: locations (Tensor): anchor points for current layer, [H*W, 2] - box_cls (Tensor): categories prediction, [N, C, H, W], + box_cls (Tensor): categories prediction, [N, C, H, W], C is the number of classes box_reg (Tensor): bounding box prediction, [N, 4, H, W] box_ctn (Tensor): centerness prediction, [N, 1, H, W] @@ -807,7 +825,6 @@ class TTFBox(object): # batch size is 1 scores_r = paddle.reshape(scores, [cat, -1]) topk_scores, topk_inds = paddle.topk(scores_r, k) - topk_scores, topk_inds = paddle.topk(scores_r, k) topk_ys = topk_inds // width topk_xs = topk_inds % width @@ -1198,11 +1215,11 @@ def _convert_attention_mask(attn_mask, dtype): to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. Returns: diff --git a/paddlers/models/ppdet/modeling/losses/__init__.py b/paddlers/models/ppdet/modeling/losses/__init__.py index dc1cc49..a4a09f0 100644 --- a/paddlers/models/ppdet/modeling/losses/__init__.py +++ b/paddlers/models/ppdet/modeling/losses/__init__.py @@ -25,6 +25,8 @@ from . import fairmot_loss from . import gfocal_loss from . import detr_loss from . import sparsercnn_loss +from . import focal_loss +from . 
import smooth_l1_loss from .yolo_loss import * from .iou_aware_loss import * @@ -39,3 +41,5 @@ from .fairmot_loss import * from .gfocal_loss import * from .detr_loss import * from .sparsercnn_loss import * +from .focal_loss import * +from .smooth_l1_loss import * diff --git a/paddlers/models/ppdet/modeling/losses/detr_loss.py b/paddlers/models/ppdet/modeling/losses/detr_loss.py index 5c654d3..7333903 100644 --- a/paddlers/models/ppdet/modeling/losses/detr_loss.py +++ b/paddlers/models/ppdet/modeling/losses/detr_loss.py @@ -80,7 +80,7 @@ class DETRLoss(nn.Layer): target_label = target_label.reshape([bs, num_query_objects]) if self.use_focal_loss: target_label = F.one_hot(target_label, - self.num_classes + 1)[:, :, :-1] + self.num_classes + 1)[..., :-1] return { 'loss_class': self.loss_coeff['class'] * sigmoid_focal_loss( logits, target_label, num_gts / num_query_objects) diff --git a/paddlers/models/ppdet/modeling/losses/fairmot_loss.py b/paddlers/models/ppdet/modeling/losses/fairmot_loss.py old mode 100644 new mode 100755 diff --git a/paddlers/models/ppdet/modeling/losses/fcos_loss.py b/paddlers/models/ppdet/modeling/losses/fcos_loss.py index db90b7d..675bcd6 100644 --- a/paddlers/models/ppdet/modeling/losses/fcos_loss.py +++ b/paddlers/models/ppdet/modeling/losses/fcos_loss.py @@ -30,7 +30,7 @@ def flatten_tensor(inputs, channel_first=False): Flatten a Tensor Args: inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C] - channel_first (bool): If true the dimension order of Tensor is + channel_first (bool): If true the dimension order of Tensor is [N, C, H, W], otherwise is [N, H, W, C] Return: output_channel_last (Tensor): The flattened Tensor in channel_last style diff --git a/paddlers/models/ppdet/modeling/losses/focal_loss.py b/paddlers/models/ppdet/modeling/losses/focal_loss.py new file mode 100644 index 0000000..508a08c --- /dev/null +++ b/paddlers/models/ppdet/modeling/losses/focal_loss.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from paddlers.models.ppdet.core.workspace import register + +__all__ = ['FocalLoss'] + + +@register +class FocalLoss(nn.Layer): + """A wrapper around paddle.nn.functional.sigmoid_focal_loss. 
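+
+    A minimal usage sketch (illustrative only; assumes pred is an
+    [N, num_classes] logits tensor and target an [N] integer label tensor):
+
+        loss_fn = FocalLoss(alpha=0.25, gamma=2.0)
+        loss = loss_fn(pred, target, reduction='mean')
+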
+ Args: + use_sigmoid (bool): currently only supports use_sigmoid=True + alpha (float): parameter alpha in Focal Loss + gamma (float): parameter gamma in Focal Loss + loss_weight (float): the final loss will be multiplied by this + """ + + def __init__(self, use_sigmoid=True, alpha=0.25, gamma=2.0, + loss_weight=1.0): + super(FocalLoss, self).__init__() + assert use_sigmoid == True, \ + 'Focal Loss only supports sigmoid at the moment' + self.use_sigmoid = use_sigmoid + self.alpha = alpha + self.gamma = gamma + self.loss_weight = loss_weight + + def forward(self, pred, target, reduction='none'): + """forward function. + Args: + pred (Tensor): logits of class prediction, of shape (N, num_classes) + target (Tensor): target class label, of shape (N, ) + reduction (str): the way to reduce loss, one of (none, sum, mean) + """ + num_classes = pred.shape[1] + target = F.one_hot(target, num_classes + 1).cast(pred.dtype) + target = target[:, :-1].detach() + loss = F.sigmoid_focal_loss( + pred, + target, + alpha=self.alpha, + gamma=self.gamma, + reduction=reduction) + return loss * self.loss_weight diff --git a/paddlers/models/ppdet/modeling/losses/iou_loss.py b/paddlers/models/ppdet/modeling/losses/iou_loss.py index 548fbb4..deb3332 100644 --- a/paddlers/models/ppdet/modeling/losses/iou_loss.py +++ b/paddlers/models/ppdet/modeling/losses/iou_loss.py @@ -17,13 +17,13 @@ from __future__ import division from __future__ import print_function import numpy as np - +import math import paddle from paddlers.models.ppdet.core.workspace import register, serializable from ..bbox_utils import bbox_iou -__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss'] +__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss'] @@ -208,3 +208,88 @@ class DIouLoss(GIoULoss): diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight) return diou * self.loss_weight + + +@register +@serializable +class SIoULoss(GIoULoss): + """ + see https://arxiv.org/pdf/2205.12740.pdf + Args: + loss_weight (float): siou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + theta (float): default as 4 + reduction (str): Options are "none", "mean" and "sum". 
default as none + """ + + def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'): + super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps) + self.loss_weight = loss_weight + self.eps = eps + self.theta = theta + self.reduction = reduction + + def __call__(self, pbox, gbox): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + + box1 = [x1, y1, x2, y2] + box2 = [x1g, y1g, x2g, y2g] + iou = bbox_iou(box1, box2) + + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + w = x2 - x1 + self.eps + h = y2 - y1 + self.eps + + cxg = (x1g + x2g) / 2 + cyg = (y1g + y2g) / 2 + wg = x2g - x1g + self.eps + hg = y2g - y1g + self.eps + + x2 = paddle.maximum(x1, x2) + y2 = paddle.maximum(y1, y2) + + # A or B + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + cw_out = xc2 - xc1 + ch_out = yc2 - yc1 + + ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg) + cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg) + + # angle cost + dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2) + sin_angle_alpha = ch / dist_intersection + sin_angle_beta = cw / dist_intersection + thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2 + thred.stop_gradient = True + sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta, + sin_angle_alpha) + angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2) + + # distance cost + gamma = 2 - angle_cost + # gamma.stop_gradient = True + beta_x = ((cxg - cx) / cw_out)**2 + beta_y = ((cyg - cy) / ch_out)**2 + dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma * + beta_y) + + # shape cost + omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg) + omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg) + omega = (1 - paddle.exp(-omega_w))**self.theta + ( + 1 - paddle.exp(-omega_h))**self.theta + siou_loss = 1 - iou + (omega + dist_cost) / 2 + + if self.reduction == 'mean': + siou_loss = paddle.mean(siou_loss) + elif self.reduction == 'sum': + siou_loss = paddle.sum(siou_loss) + + return siou_loss * self.loss_weight diff --git a/paddlers/models/ppdet/modeling/losses/smooth_l1_loss.py b/paddlers/models/ppdet/modeling/losses/smooth_l1_loss.py new file mode 100644 index 0000000..7fb1eaf --- /dev/null +++ b/paddlers/models/ppdet/modeling/losses/smooth_l1_loss.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlers.models.ppdet.core.workspace import register + +__all__ = ['SmoothL1Loss'] + + +@register +class SmoothL1Loss(nn.Layer): + """Smooth L1 Loss. 
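+
+    A minimal usage sketch (illustrative only; assumes pred and target are
+    float tensors of the same shape, e.g. [N, 4] box deltas):
+
+        loss_fn = SmoothL1Loss(beta=1.0)
+        loss = loss_fn(pred, target, reduction='mean')
+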
+ Args: + beta (float): controls smooth region, it becomes L1 Loss when beta=0.0 + loss_weight (float): the final loss will be multiplied by this + """ + + def __init__(self, beta=1.0, loss_weight=1.0): + super(SmoothL1Loss, self).__init__() + assert beta >= 0 + self.beta = beta + self.loss_weight = loss_weight + + def forward(self, pred, target, reduction='none'): + """forward function, based on fvcore. + Args: + pred (Tensor): prediction tensor + target (Tensor): target tensor, pred.shape must be the same as target.shape + reduction (str): the way to reduce loss, one of (none, sum, mean) + """ + assert reduction in ('none', 'sum', 'mean') + target = target.detach() + if self.beta < 1e-5: + loss = paddle.abs(pred - target) + else: + n = paddle.abs(pred - target) + cond = n < self.beta + loss = paddle.where(cond, 0.5 * n**2 / self.beta, + n - 0.5 * self.beta) + if reduction == 'mean': + loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum() + elif reduction == 'sum': + loss = loss.sum() + return loss * self.loss_weight diff --git a/paddlers/models/ppdet/modeling/losses/sparsercnn_loss.py b/paddlers/models/ppdet/modeling/losses/sparsercnn_loss.py index e7f2ef1..8f99de8 100644 --- a/paddlers/models/ppdet/modeling/losses/sparsercnn_loss.py +++ b/paddlers/models/ppdet/modeling/losses/sparsercnn_loss.py @@ -198,7 +198,7 @@ class SparseRCNNLoss(nn.Layer): # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) - # Compute the average number of target boxes accross all nodes, for normalization purposes + # Compute the average number of target boxes across all nodes, for normalization purposes num_boxes = sum(len(t["labels"]) for t in targets) num_boxes = paddle.to_tensor( [num_boxes], diff --git a/paddlers/models/ppdet/modeling/losses/ssd_loss.py b/paddlers/models/ppdet/modeling/losses/ssd_loss.py index 345f095..53c1198 100644 --- a/paddlers/models/ppdet/modeling/losses/ssd_loss.py +++ b/paddlers/models/ppdet/modeling/losses/ssd_loss.py @@ -20,8 +20,7 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F from paddlers.models.ppdet.core.workspace import register -from ..ops import iou_similarity -from ..bbox_utils import bbox2delta +from ..bbox_utils import iou_similarity, bbox2delta __all__ = ['SSDLoss'] diff --git a/paddlers/models/ppdet/modeling/losses/varifocal_loss.py b/paddlers/models/ppdet/modeling/losses/varifocal_loss.py index 854b253..030e17a 100644 --- a/paddlers/models/ppdet/modeling/losses/varifocal_loss.py +++ b/paddlers/models/ppdet/modeling/losses/varifocal_loss.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. # The code is based on: diff --git a/paddlers/models/ppdet/modeling/losses/yolo_loss.py b/paddlers/models/ppdet/modeling/losses/yolo_loss.py index fadc303..a0d0f0d 100644 --- a/paddlers/models/ppdet/modeling/losses/yolo_loss.py +++ b/paddlers/models/ppdet/modeling/losses/yolo_loss.py @@ -21,7 +21,7 @@ import paddle.nn as nn import paddle.nn.functional as F from paddlers.models.ppdet.core.workspace import register -from ..bbox_utils import decode_yolo, xywh2xyxy, iou_similarity +from ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity __all__ = ['YOLOv3Loss'] @@ -56,7 +56,7 @@ class YOLOv3Loss(nn.Layer): downsample (list): downsample ratio for each detection block scale_x_y (float): scale_x_y factor iou_loss (object): IoULoss instance - iou_aware_loss (object): IouAwareLoss instance + iou_aware_loss (object): IouAwareLoss instance """ super(YOLOv3Loss, self).__init__() self.num_classes = num_classes @@ -80,7 +80,7 @@ class YOLOv3Loss(nn.Layer): gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5 gbox = paddle.concat([gxy, gwh], axis=-1) - iou = iou_similarity(pbox, gbox) + iou = batch_iou_similarity(pbox, gbox) iou.stop_gradient = True iou_max = iou.max(2) # [N, M1] iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype) diff --git a/paddlers/models/ppdet/modeling/mot/matching/__init__.py b/paddlers/models/ppdet/modeling/mot/matching/__init__.py index 9b2f207..7a25400 100644 --- a/paddlers/models/ppdet/modeling/mot/matching/__init__.py +++ b/paddlers/models/ppdet/modeling/mot/matching/__init__.py @@ -14,6 +14,8 @@ from . import jde_matching from . import deepsort_matching +from . import ocsort_matching from .jde_matching import * from .deepsort_matching import * +from .ocsort_matching import * diff --git a/paddlers/models/ppdet/modeling/mot/matching/deepsort_matching.py b/paddlers/models/ppdet/modeling/mot/matching/deepsort_matching.py index 1a50b30..85fba90 100644 --- a/paddlers/models/ppdet/modeling/mot/matching/deepsort_matching.py +++ b/paddlers/models/ppdet/modeling/mot/matching/deepsort_matching.py @@ -78,8 +78,8 @@ def iou_cost(tracks, detections, track_indices=None, detection_indices=None): that should be matched. Defaults to all `detections`. Returns: - cost_matrix (ndarray): A cost matrix of shape len(track_indices), - len(detection_indices) where entry (i, j) is + cost_matrix (ndarray): A cost matrix of shape len(track_indices), + len(detection_indices) where entry (i, j) is `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. """ if track_indices is None: @@ -152,7 +152,7 @@ class NearestNeighborDistanceMetric(object): budget (Optional[int]): If not None, fix samples per class to at most this number. Removes the oldest samples when the budget is reached. - Attributes: + Attributes: samples (Dict[int -> List[ndarray]]): A dictionary that maps from target identities to the list of samples that have been observed so far. 
""" @@ -216,8 +216,8 @@ def min_cost_matching(distance_metric, Args: distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray - The distance metric is given a list of tracks and detections as - well as a list of N track indices and M detection indices. The + The distance metric is given a list of tracks and detections as + well as a list of N track indices and M detection indices. The metric should return the NxM dimensional cost matrix, where element (i, j) is the association cost between the i-th track in the given track indices and the j-th detection in the given detection_indices. @@ -284,8 +284,8 @@ def matching_cascade(distance_metric, Args: distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray - The distance metric is given a list of tracks and detections as - well as a list of N track indices and M detection indices. The + The distance metric is given a list of tracks and detections as + well as a list of N track indices and M detection indices. The metric should return the NxM dimensional cost matrix, where element (i, j) is the association cost between the i-th track in the given track indices and the j-th detection in the given detection_indices. diff --git a/paddlers/models/ppdet/modeling/mot/matching/jde_matching.py b/paddlers/models/ppdet/modeling/mot/matching/jde_matching.py index 08a1963..89be751 100644 --- a/paddlers/models/ppdet/modeling/mot/matching/jde_matching.py +++ b/paddlers/models/ppdet/modeling/mot/matching/jde_matching.py @@ -15,7 +15,14 @@ This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py """ -import lap +try: + import lap +except: + print( + 'Warning: Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' + ) + pass + import scipy import numpy as np from scipy.spatial.distance import cdist @@ -26,7 +33,7 @@ warnings.filterwarnings("ignore") __all__ = [ 'merge_matches', 'linear_assignment', - 'cython_bbox_ious', + 'bbox_ious', 'iou_distance', 'embedding_distance', 'fuse_motion', @@ -53,6 +60,12 @@ def merge_matches(m1, m2, shape): def linear_assignment(cost_matrix, thresh): + try: + import lap + except Exception as e: + raise RuntimeError( + 'Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' + ) if cost_matrix.size == 0: return np.empty( (0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple( @@ -68,22 +81,28 @@ def linear_assignment(cost_matrix, thresh): return matches, unmatched_a, unmatched_b -def cython_bbox_ious(atlbrs, btlbrs): - ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) - if ious.size == 0: +def bbox_ious(atlbrs, btlbrs): + boxes = np.ascontiguousarray(atlbrs, dtype=np.float) + query_boxes = np.ascontiguousarray(btlbrs, dtype=np.float) + N = boxes.shape[0] + K = query_boxes.shape[0] + ious = np.zeros((N, K), dtype=boxes.dtype) + if N * K == 0: return ious - try: - import cython_bbox - except Exception as e: - print('cython_bbox not found, please install cython_bbox.' 
- 'for example: `pip install cython_bbox`.') - raise e - - ious = cython_bbox.bbox_overlaps( - np.ascontiguousarray( - atlbrs, dtype=np.float), - np.ascontiguousarray( - btlbrs, dtype=np.float)) + + for k in range(K): + box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + 1) * + (query_boxes[k, 3] - query_boxes[k, 1] + 1)) + for n in range(N): + iw = (min(boxes[n, 2], query_boxes[k, 2]) - max( + boxes[n, 0], query_boxes[k, 0]) + 1) + if iw > 0: + ih = (min(boxes[n, 3], query_boxes[k, 3]) - max( + boxes[n, 1], query_boxes[k, 1]) + 1) + if ih > 0: + ua = float((boxes[n, 2] - boxes[n, 0] + 1) * (boxes[ + n, 3] - boxes[n, 1] + 1) + box_area - iw * ih) + ious[n, k] = iw * ih / ua return ious @@ -98,7 +117,7 @@ def iou_distance(atracks, btracks): else: atlbrs = [track.tlbr for track in atracks] btlbrs = [track.tlbr for track in btracks] - _ious = cython_bbox_ious(atlbrs, btlbrs) + _ious = bbox_ious(atlbrs, btlbrs) cost_matrix = 1 - _ious return cost_matrix diff --git a/paddlers/models/ppdet/modeling/mot/matching/ocsort_matching.py b/paddlers/models/ppdet/modeling/mot/matching/ocsort_matching.py new file mode 100644 index 0000000..a32d761 --- /dev/null +++ b/paddlers/models/ppdet/modeling/mot/matching/ocsort_matching.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
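+#
+# The association below couples an IoU cost between detections and
+# tracker predictions with a velocity-direction consistency cost, and
+# solves the result as a linear assignment (lap.lapjv when available,
+# scipy.optimize.linear_sum_assignment otherwise).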
+""" +This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/association.py +""" + +import os +import numpy as np + + +def iou_batch(bboxes1, bboxes2): + bboxes2 = np.expand_dims(bboxes2, 0) + bboxes1 = np.expand_dims(bboxes1, 1) + + xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) + yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) + xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) + yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) + w = np.maximum(0., xx2 - xx1) + h = np.maximum(0., yy2 - yy1) + area = w * h + iou_matrix = area / ((bboxes1[..., 2] - bboxes1[..., 0]) * + (bboxes1[..., 3] - bboxes1[..., 1]) + + (bboxes2[..., 2] - bboxes2[..., 0]) * + (bboxes2[..., 3] - bboxes2[..., 1]) - area) + return iou_matrix + + +def speed_direction_batch(dets, tracks): + tracks = tracks[..., np.newaxis] + CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0 + CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, ( + tracks[:, 1] + tracks[:, 3]) / 2.0 + dx = CX1 - CX2 + dy = CY1 - CY2 + norm = np.sqrt(dx**2 + dy**2) + 1e-6 + dx = dx / norm + dy = dy / norm + return dy, dx + + +def linear_assignment(cost_matrix): + try: + import lap + _, x, y = lap.lapjv(cost_matrix, extend_cost=True) + return np.array([[y[i], i] for i in x if i >= 0]) + except ImportError: + from scipy.optimize import linear_sum_assignment + x, y = linear_sum_assignment(cost_matrix) + return np.array(list(zip(x, y))) + + +def associate(detections, trackers, iou_threshold, velocities, previous_obs, + vdc_weight): + if (len(trackers) == 0): + return np.empty( + (0, 2), dtype=int), np.arange(len(detections)), np.empty( + (0, 5), dtype=int) + + Y, X = speed_direction_batch(detections, previous_obs) + inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1] + inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1) + inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1) + diff_angle_cos = inertia_X * X + inertia_Y * Y + diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1) + diff_angle = np.arccos(diff_angle_cos) + diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi + + valid_mask = np.ones(previous_obs.shape[0]) + valid_mask[np.where(previous_obs[:, 4] < 0)] = 0 + + iou_matrix = iou_batch(detections, trackers) + scores = np.repeat( + detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1) + # iou_matrix = iou_matrix * scores # a trick sometiems works, we don't encourage this + valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1) + + angle_diff_cost = (valid_mask * diff_angle) * vdc_weight + angle_diff_cost = angle_diff_cost.T + angle_diff_cost = angle_diff_cost * scores + + if min(iou_matrix.shape) > 0: + a = (iou_matrix > iou_threshold).astype(np.int32) + if a.sum(1).max() == 1 and a.sum(0).max() == 1: + matched_indices = np.stack(np.where(a), axis=1) + else: + matched_indices = linear_assignment(-(iou_matrix + angle_diff_cost)) + else: + matched_indices = np.empty(shape=(0, 2)) + + unmatched_detections = [] + for d, det in enumerate(detections): + if (d not in matched_indices[:, 0]): + unmatched_detections.append(d) + unmatched_trackers = [] + for t, trk in enumerate(trackers): + if (t not in matched_indices[:, 1]): + unmatched_trackers.append(t) + + # filter out matched with low IOU + matches = [] + for m in matched_indices: + if (iou_matrix[m[0], m[1]] < iou_threshold): + unmatched_detections.append(m[0]) + unmatched_trackers.append(m[1]) + else: + matches.append(m.reshape(1, 2)) + if (len(matches) == 0): + matches = 
np.empty((0, 2), dtype=int) + else: + matches = np.concatenate(matches, axis=0) + + return matches, np.array(unmatched_detections), np.array(unmatched_trackers) diff --git a/paddlers/models/ppdet/modeling/mot/motion/kalman_filter.py b/paddlers/models/ppdet/modeling/mot/motion/kalman_filter.py index 6de6198..6714a00 100644 --- a/paddlers/models/ppdet/modeling/mot/motion/kalman_filter.py +++ b/paddlers/models/ppdet/modeling/mot/motion/kalman_filter.py @@ -83,7 +83,7 @@ class KalmanFilter(object): Returns: The mean vector (8 dimensional) and covariance matrix (8x8 - dimensional) of the new track. Unobserved velocities are + dimensional) of the new track. Unobserved velocities are initialized to 0 mean. """ mean_pos = measurement @@ -112,7 +112,7 @@ class KalmanFilter(object): object state at the previous time step. Returns: - The mean vector and covariance matrix of the predicted state. + The mean vector and covariance matrix of the predicted state. Unobserved velocities are initialized to 0 mean. """ std_pos = [ @@ -157,7 +157,7 @@ class KalmanFilter(object): def multi_predict(self, mean, covariance): """ Run Kalman filter prediction step (Vectorized version). - + Args: mean (ndarray): The Nx8 dimensional mean matrix of the object states at the previous time step. @@ -231,7 +231,7 @@ class KalmanFilter(object): A suitable distance threshold can be obtained from `chi2inv95`. If `only_position` is False, the chi-square distribution has 4 degrees of freedom, otherwise 2. - + Args: mean (ndarray): Mean vector over the state distribution (8 dimensional). @@ -240,7 +240,7 @@ class KalmanFilter(object): measurements (ndarray): An Nx4 dimensional matrix of N measurements, each in format (x, y, a, h) where (x, y) is the bounding box center position, a the aspect ratio, and h the height. - only_position (Optional[bool]): If True, distance computation is + only_position (Optional[bool]): If True, distance computation is done with respect to the bounding box center position only. metric (str): Metric type, 'gaussian' or 'maha'. diff --git a/paddlers/models/ppdet/modeling/mot/tracker/__init__.py b/paddlers/models/ppdet/modeling/mot/tracker/__init__.py index f97fe45..8019cac 100644 --- a/paddlers/models/ppdet/modeling/mot/tracker/__init__.py +++ b/paddlers/models/ppdet/modeling/mot/tracker/__init__.py @@ -16,8 +16,10 @@ from . import base_jde_tracker from . import base_sde_tracker from . import jde_tracker from . import deepsort_tracker +from . import ocsort_tracker from .base_jde_tracker import * from .base_sde_tracker import * from .jde_tracker import * from .deepsort_tracker import * +from .ocsort_tracker import * diff --git a/paddlers/models/ppdet/modeling/mot/tracker/base_jde_tracker.py b/paddlers/models/ppdet/modeling/mot/tracker/base_jde_tracker.py index 37b2501..6ee42a0 100644 --- a/paddlers/models/ppdet/modeling/mot/tracker/base_jde_tracker.py +++ b/paddlers/models/ppdet/modeling/mot/tracker/base_jde_tracker.py @@ -51,7 +51,7 @@ class BaseTrack(object): history = OrderedDict() features = [] - curr_feature = None + curr_feat = None score = 0 start_frame = 0 frame_id = 0 diff --git a/paddlers/models/ppdet/modeling/mot/tracker/deepsort_tracker.py b/paddlers/models/ppdet/modeling/mot/tracker/deepsort_tracker.py index 221890a..4000fe6 100644 --- a/paddlers/models/ppdet/modeling/mot/tracker/deepsort_tracker.py +++ b/paddlers/models/ppdet/modeling/mot/tracker/deepsort_tracker.py @@ -47,12 +47,12 @@ class DeepSORTTracker(object): Removes the oldest samples when the budget is reached. 
max_age (int): maximum number of missed misses before a track is deleted n_init (float): Number of frames that a track remains in initialization - phase. Number of consecutive detections before the track is confirmed. - The track state is set to `Deleted` if a miss occurs within the first + phase. Number of consecutive detections before the track is confirmed. + The track state is set to `Deleted` if a miss occurs within the first `n_init` frames. - metric_type (str): either "euclidean" or "cosine", the distance metric + metric_type (str): either "euclidean" or "cosine", the distance metric used for measurement to track association. - matching_threshold (float): samples with larger distance are + matching_threshold (float): samples with larger distance are considered an invalid match. max_iou_distance (float): max iou distance threshold motion (object): KalmanFilter instance @@ -96,13 +96,16 @@ class DeepSORTTracker(object): Perform measurement update and track management. Args: pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'x0, y0, x1, y1, score, cls_id'. + [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128], usually pred_embs.shape[1] is a multiple of 128. """ - pred_tlwhs = pred_dets[:, :4] - pred_scores = pred_dets[:, 4:5] - pred_cls_ids = pred_dets[:, 5:] + pred_cls_ids = pred_dets[:, 0:1] + pred_scores = pred_dets[:, 1:2] + pred_xyxys = pred_dets[:, 2:6] + pred_tlwhs = np.concatenate( + (pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), + axis=1) detections = [ Detection(tlwh, score, feat, cls_id) diff --git a/paddlers/models/ppdet/modeling/mot/tracker/jde_tracker.py b/paddlers/models/ppdet/modeling/mot/tracker/jde_tracker.py index 2d0d308..d2ac7fe 100644 --- a/paddlers/models/ppdet/modeling/mot/tracker/jde_tracker.py +++ b/paddlers/models/ppdet/modeling/mot/tracker/jde_tracker.py @@ -38,22 +38,30 @@ class JDETracker(object): JDE tracker, support single class and multi classes Args: + use_byte (bool): Whether to use ByteTracker, default False num_classes (int): the number of classes det_thresh (float): threshold of detection score track_buffer (int): buffer for tracker min_box_area (int): min box area to filter out low quality boxes vertical_ratio (float): w/h, the vertical ratio of the bbox to filter - bad results. If set <0 means no need to filter bboxes,usually set + bad results. If set <= 0, no filtering is applied; usually set 1.6 for pedestrian tracking. - tracked_thresh (float): linear assignment threshold of tracked + tracked_thresh (float): linear assignment threshold of tracked stracks and detections - r_tracked_thresh (float): linear assignment threshold of + r_tracked_thresh (float): linear assignment threshold of tracked stracks and unmatched detections - unconfirmed_thresh (float): linear assignment threshold of + unconfirmed_thresh (float): linear assignment threshold of unconfirmed stracks and unmatched detections + conf_thres (float): confidence threshold for tracking, also used in + ByteTracker as higher confidence threshold + match_thres (float): linear assignment threshold of tracked + stracks and detections in ByteTracker + low_conf_thres (float): lower confidence threshold for tracking in + ByteTracker + input_size (list): input feature map size to reid model, [h, w] format, + [64, 192] as default. 
motion (str): motion model, KalmanFilter as default - conf_thres (float): confidence threshold for tracking - metric_type (str): either "euclidean" or "cosine", the distance metric + metric_type (str): either "euclidean" or "cosine", the distance metric used for measurement to track association. """ @@ -62,14 +70,15 @@ class JDETracker(object): num_classes=1, det_thresh=0.3, track_buffer=30, - min_box_area=200, - vertical_ratio=1.6, + min_box_area=0, + vertical_ratio=0, tracked_thresh=0.7, r_tracked_thresh=0.5, unconfirmed_thresh=0.7, conf_thres=0, match_thres=0.8, low_conf_thres=0.2, + input_size=[64, 192], motion='KalmanFilter', metric_type='euclidean'): self.use_byte = use_byte @@ -86,6 +95,7 @@ class JDETracker(object): self.match_thres = match_thres self.low_conf_thres = low_conf_thres + self.input_size = input_size if motion == 'KalmanFilter': self.motion = KalmanFilter() self.metric_type = metric_type @@ -106,13 +116,13 @@ class JDETracker(object): Args: pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'x0, y0, x1, y1, score, cls_id'. + [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128] or [N, 512]. Return: output_stracks_dict (dict(list)): The list contains information - regarding the online_tracklets for the recieved image tensor. + regarding the online_tracklets for the received image tensor. """ self.frame_id += 1 if self.frame_id == 1: @@ -128,7 +138,7 @@ class JDETracker(object): # unify single and multi classes detection and embedding results for cls_id in range(self.num_classes): - cls_idx = (pred_dets[:, 5:] == cls_id).squeeze(-1) + cls_idx = (pred_dets[:, 0:1] == cls_id).squeeze(-1) pred_dets_dict[cls_id] = pred_dets[cls_idx] if pred_embs is not None: pred_embs_dict[cls_id] = pred_embs[cls_idx] @@ -139,14 +149,15 @@ class JDETracker(object): """ Step 1: Get detections by class""" pred_dets_cls = pred_dets_dict[cls_id] pred_embs_cls = pred_embs_dict[cls_id] - remain_inds = (pred_dets_cls[:, 4:5] > self.conf_thres).squeeze(-1) + remain_inds = (pred_dets_cls[:, 1:2] > self.conf_thres).squeeze(-1) if remain_inds.sum() > 0: pred_dets_cls = pred_dets_cls[remain_inds] - if self.use_byte: + if pred_embs_cls is None: + # in original ByteTrack detections = [ STrack( - STrack.tlbr_to_tlwh(tlbrs[:4]), - tlbrs[4], + STrack.tlbr_to_tlwh(tlbrs[2:6]), + tlbrs[1], cls_id, 30, temp_feat=None) for tlbrs in pred_dets_cls @@ -155,7 +166,7 @@ class JDETracker(object): pred_embs_cls = pred_embs_cls[remain_inds] detections = [ STrack( - STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], cls_id, + STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat) for (tlbrs, temp_feat ) in zip(pred_dets_cls, pred_embs_cls) @@ -181,11 +192,12 @@ class JDETracker(object): # Predict the current location with KalmanFilter STrack.multi_predict(track_pool_dict[cls_id], self.motion) - if self.use_byte: + if pred_embs_cls is None: + # in original ByteTrack dists = matching.iou_distance(track_pool_dict[cls_id], detections) matches, u_track, u_detection = matching.linear_assignment( - dists, thresh=self.match_thres) # + dists, thresh=self.match_thres) # not self.tracked_thresh else: dists = matching.embedding_distance( track_pool_dict[cls_id], @@ -213,22 +225,34 @@ class JDETracker(object): # None of the steps below happen if there are no undetected tracks. 
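+            # When use_byte is set, Step 3 below runs the ByteTrack second
+            # association: detections with low_conf_thres < score < conf_thres
+            # are matched to the still-unmatched tracks by IoU alone.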
""" Step 3: Second association, with IOU""" if self.use_byte: - inds_low = pred_dets_dict[cls_id][:, 4:5] > self.low_conf_thres - inds_high = pred_dets_dict[cls_id][:, 4:5] < self.conf_thres + inds_low = pred_dets_dict[cls_id][:, 1:2] > self.low_conf_thres + inds_high = pred_dets_dict[cls_id][:, 1:2] < self.conf_thres inds_second = np.logical_and(inds_low, inds_high).squeeze(-1) pred_dets_cls_second = pred_dets_dict[cls_id][inds_second] # association the untrack to the low score detections if len(pred_dets_cls_second) > 0: - detections_second = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[:4]), - tlbrs[4], - cls_id, - 30, - temp_feat=None) - for tlbrs in pred_dets_cls_second[:, :5] - ] + if pred_embs_dict[cls_id] is None: + # in original ByteTrack + detections_second = [ + STrack( + STrack.tlbr_to_tlwh(tlbrs[2:6]), + tlbrs[1], + cls_id, + 30, + temp_feat=None) + for tlbrs in pred_dets_cls_second + ] + else: + pred_embs_cls_second = pred_embs_dict[cls_id][ + inds_second] + detections_second = [ + STrack( + STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], + cls_id, 30, temp_feat) + for (tlbrs, temp_feat) in zip(pred_dets_cls_second, + pred_embs_cls_second) + ] else: detections_second = [] r_tracked_stracks = [ diff --git a/paddlers/models/ppdet/modeling/mot/tracker/ocsort_tracker.py b/paddlers/models/ppdet/modeling/mot/tracker/ocsort_tracker.py new file mode 100644 index 0000000..b86eb5e --- /dev/null +++ b/paddlers/models/ppdet/modeling/mot/tracker/ocsort_tracker.py @@ -0,0 +1,369 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/ocsort.py +""" + +import numpy as np +try: + from filterpy.kalman import KalmanFilter +except: + print( + 'Warning: Unable to use OC-SORT, please install filterpy, for example: `pip install filterpy`, see https://github.com/rlabbe/filterpy' + ) + pass + +from ..matching.ocsort_matching import associate, linear_assignment, iou_batch +from paddlers.models.ppdet.core.workspace import register, serializable + + +def k_previous_obs(observations, cur_age, k): + if len(observations) == 0: + return [-1, -1, -1, -1, -1] + for i in range(k): + dt = k - i + if cur_age - dt in observations: + return observations[cur_age - dt] + max_age = max(observations.keys()) + return observations[max_age] + + +def convert_bbox_to_z(bbox): + """ + Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form + [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is + the aspect ratio + """ + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x = bbox[0] + w / 2. + y = bbox[1] + h / 2. 
+ s = w * h # scale is just area + r = w / float(h + 1e-6) + return np.array([x, y, s, r]).reshape((4, 1)) + + +def convert_x_to_bbox(x, score=None): + """ + Takes a bounding box in the centre form [x,y,s,r] and returns it in the form + [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right + """ + w = np.sqrt(x[2] * x[3]) + h = x[2] / w + if score is None: + return np.array( + [x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., + x[1] + h / 2.]).reshape((1, 4)) + else: + score = np.array([score]) + return np.array([ + x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score + ]).reshape((1, 5)) + + +def speed_direction(bbox1, bbox2): + cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 + cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 + speed = np.array([cy2 - cy1, cx2 - cx1]) + norm = np.sqrt((cy2 - cy1)**2 + (cx2 - cx1)**2) + 1e-6 + return speed / norm + + +class KalmanBoxTracker(object): + """ + This class represents the internal state of individual tracked objects observed as bbox. + + Args: + bbox (np.array): bbox in [x1,y1,x2,y2,score] format. + delta_t (int): delta_t of previous observation + """ + count = 0 + + def __init__(self, bbox, delta_t=3): + try: + from filterpy.kalman import KalmanFilter + except Exception as e: + raise RuntimeError( + 'Unable to use OC-SORT, please install filterpy, for example: `pip install filterpy`, see https://github.com/rlabbe/filterpy' + ) + self.kf = KalmanFilter(dim_x=7, dim_z=4) + self.kf.F = np.array([[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0, 1]]) + self.kf.H = np.array([[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0]]) + self.kf.R[2:, 2:] *= 10. + self.kf.P[4:, 4:] *= 1000. + # give high uncertainty to the unobservable initial velocities + self.kf.P *= 10. + self.kf.Q[-1, -1] *= 0.01 + self.kf.Q[4:, 4:] *= 0.01 + + self.score = bbox[4] + self.kf.x[:4] = convert_bbox_to_z(bbox) + self.time_since_update = 0 + self.id = KalmanBoxTracker.count + KalmanBoxTracker.count += 1 + self.history = [] + self.hits = 0 + self.hit_streak = 0 + self.age = 0 + """ + NOTE: [-1,-1,-1,-1,-1] is a compromising placeholder for non-observation status, the same as the return of + function k_previous_obs. It is ugly and I do not like it. But to support generating the observation array in a + fast and unified way (see k_observations = np.array([k_previous_obs(...)]) below), let's bear it for now. + """ + self.last_observation = np.array([-1, -1, -1, -1, -1]) # placeholder + self.observations = dict() + self.history_observations = [] + self.velocity = None + self.delta_t = delta_t + + def update(self, bbox): + """ + Updates the state vector with observed bbox. + """ + if bbox is not None: + if self.last_observation.sum() >= 0: # a previous observation exists + previous_box = None + for i in range(self.delta_t): + dt = self.delta_t - i + if self.age - dt in self.observations: + previous_box = self.observations[self.age - dt] + break + if previous_box is None: + previous_box = self.last_observation + """ + Estimate the track speed direction with observations \Delta t steps away + """ + self.velocity = speed_direction(previous_box, bbox) + """ + Insert new observations. This is an ugly way to maintain both self.observations + and self.history_observations. Bear it for the moment. 
+ """ + self.last_observation = bbox + self.observations[self.age] = bbox + self.history_observations.append(bbox) + + self.time_since_update = 0 + self.history = [] + self.hits += 1 + self.hit_streak += 1 + self.kf.update(convert_bbox_to_z(bbox)) + else: + self.kf.update(bbox) + + def predict(self): + """ + Advances the state vector and returns the predicted bounding box estimate. + """ + if ((self.kf.x[6] + self.kf.x[2]) <= 0): + self.kf.x[6] *= 0.0 + + self.kf.predict() + self.age += 1 + if (self.time_since_update > 0): + self.hit_streak = 0 + self.time_since_update += 1 + self.history.append(convert_x_to_bbox(self.kf.x, score=self.score)) + return self.history[-1] + + def get_state(self): + return convert_x_to_bbox(self.kf.x, score=self.score) + + +@register +@serializable +class OCSORTTracker(object): + """ + OCSORT tracker, support single class + + Args: + det_thresh (float): threshold of detection score + max_age (int): maximum number of missed misses before a track is deleted + min_hits (int): minimum hits for associate + iou_threshold (float): iou threshold for associate + delta_t (int): delta_t of previous observation + inertia (float): vdc_weight of angle_diff_cost for associate + vertical_ratio (float): w/h, the vertical ratio of the bbox to filter + bad results. If set <= 0 means no need to filter bboxes,usually set + 1.6 for pedestrian tracking. + min_box_area (int): min box area to filter out low quality boxes + use_byte (bool): Whether use ByteTracker, default False + """ + + def __init__(self, + det_thresh=0.6, + max_age=30, + min_hits=3, + iou_threshold=0.3, + delta_t=3, + inertia=0.2, + vertical_ratio=-1, + min_box_area=0, + use_byte=False): + self.det_thresh = det_thresh + self.max_age = max_age + self.min_hits = min_hits + self.iou_threshold = iou_threshold + self.delta_t = delta_t + self.inertia = inertia + self.vertical_ratio = vertical_ratio + self.min_box_area = min_box_area + self.use_byte = use_byte + + self.trackers = [] + self.frame_count = 0 + KalmanBoxTracker.count = 0 + + def update(self, pred_dets, pred_embs=None): + """ + Args: + pred_dets (np.array): Detection results of the image, the shape is + [N, 6], means 'cls_id, score, x0, y0, x1, y1'. + pred_embs (np.array): Embedding results of the image, the shape is + [N, 128] or [N, 512], default as None. + + Return: + tracking boxes (np.array): [M, 6], means 'x0, y0, x1, y1, score, id'. + """ + if pred_dets is None: + return np.empty((0, 6)) + + self.frame_count += 1 + + bboxes = pred_dets[:, 2:] + scores = pred_dets[:, 1:2] + dets = np.concatenate((bboxes, scores), axis=1) + scores = scores.squeeze(-1) + + inds_low = scores > 0.1 + inds_high = scores < self.det_thresh + inds_second = np.logical_and(inds_low, inds_high) + # self.det_thresh > score > 0.1, for second matching + dets_second = dets[inds_second] # detections for second matching + remain_inds = scores > self.det_thresh + dets = dets[remain_inds] + + # get predicted locations from existing trackers. 
+ trks = np.zeros((len(self.trackers), 5)) + to_del = [] + ret = [] + for t, trk in enumerate(trks): + pos = self.trackers[t].predict()[0] + trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] + if np.any(np.isnan(pos)): + to_del.append(t) + trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) + for t in reversed(to_del): + self.trackers.pop(t) + + velocities = np.array([ + trk.velocity if trk.velocity is not None else np.array((0, 0)) + for trk in self.trackers + ]) + last_boxes = np.array([trk.last_observation for trk in self.trackers]) + k_observations = np.array([ + k_previous_obs(trk.observations, trk.age, self.delta_t) + for trk in self.trackers + ]) + """ + First round of association + """ + matched, unmatched_dets, unmatched_trks = associate( + dets, trks, self.iou_threshold, velocities, k_observations, + self.inertia) + for m in matched: + self.trackers[m[1]].update(dets[m[0], :]) + """ + Second round of association by OCR (observation-centric recovery) + """ + # BYTE association + if self.use_byte and len(dets_second) > 0 and unmatched_trks.shape[ 0] > 0: + u_trks = trks[unmatched_trks] + iou_left = iou_batch( + dets_second, + u_trks) # iou between low score detections and unmatched tracks + iou_left = np.array(iou_left) + if iou_left.max() > self.iou_threshold: + """ + NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may + get higher performance, especially on MOT17/MOT20 datasets. But we keep it + uniform here for simplicity + """ + matched_indices = linear_assignment(-iou_left) + to_remove_trk_indices = [] + for m in matched_indices: + det_ind, trk_ind = m[0], unmatched_trks[m[1]] + if iou_left[m[0], m[1]] < self.iou_threshold: + continue + self.trackers[trk_ind].update(dets_second[det_ind, :]) + to_remove_trk_indices.append(trk_ind) + unmatched_trks = np.setdiff1d(unmatched_trks, + np.array(to_remove_trk_indices)) + + if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0: + left_dets = dets[unmatched_dets] + left_trks = last_boxes[unmatched_trks] + iou_left = iou_batch(left_dets, left_trks) + iou_left = np.array(iou_left) + if iou_left.max() > self.iou_threshold: + """ + NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may + get higher performance, especially on MOT17/MOT20 datasets. 
But we keep it + uniform here for simplicity + """ + rematched_indices = linear_assignment(-iou_left) + to_remove_det_indices = [] + to_remove_trk_indices = [] + for m in rematched_indices: + det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[ + 1]] + if iou_left[m[0], m[1]] < self.iou_threshold: + continue + self.trackers[trk_ind].update(dets[det_ind, :]) + to_remove_det_indices.append(det_ind) + to_remove_trk_indices.append(trk_ind) + unmatched_dets = np.setdiff1d(unmatched_dets, + np.array(to_remove_det_indices)) + unmatched_trks = np.setdiff1d(unmatched_trks, + np.array(to_remove_trk_indices)) + + for m in unmatched_trks: + self.trackers[m].update(None) + + # create and initialise new trackers for unmatched detections + for i in unmatched_dets: + trk = KalmanBoxTracker(dets[i, :], delta_t=self.delta_t) + self.trackers.append(trk) + i = len(self.trackers) + for trk in reversed(self.trackers): + if trk.last_observation.sum() < 0: + d = trk.get_state()[0] + else: + d = trk.last_observation # tlbr + score + if (trk.time_since_update < 1) and ( + trk.hit_streak >= self.min_hits or + self.frame_count <= self.min_hits): + # +1 as MOT benchmark requires positive + ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) + i -= 1 + # remove dead tracklet + if (trk.time_since_update > self.max_age): + self.trackers.pop(i) + if (len(ret) > 0): + return np.concatenate(ret) + return np.empty((0, 6)) diff --git a/paddlers/models/ppdet/modeling/mot/utils.py b/paddlers/models/ppdet/modeling/mot/utils.py index a33fd0c..cf3069e 100644 --- a/paddlers/models/ppdet/modeling/mot/utils.py +++ b/paddlers/models/ppdet/modeling/mot/utils.py @@ -77,7 +77,7 @@ class Detection(object): tlwh (Tensor): Bounding box in format `(top left x, top left y, width, height)`. score (Tensor): Bounding box confidence score. - feature (Tensor): A feature vector that describes the object + feature (Tensor): A feature vector that describes the object contained in this image. cls_id (Tensor): Bounding box category id. """ @@ -205,8 +205,8 @@ def load_det_results(det_file, num_frames): def scale_coords(coords, input_shape, im_shape, scale_factor): # Note: ratio has only one value, scale_factor[0] == scale_factor[1] - # - # This function only used for JDE YOLOv3 or other detectors with + # + # This function only used for JDE YOLOv3 or other detectors with # LetterBoxResize and JDEBBoxPostProcess, coords output from detector had # not scaled back to the origin image. diff --git a/paddlers/models/ppdet/modeling/necks/__init__.py b/paddlers/models/ppdet/modeling/necks/__init__.py index 197ef56..529b5e2 100644 --- a/paddlers/models/ppdet/modeling/necks/__init__.py +++ b/paddlers/models/ppdet/modeling/necks/__init__.py @@ -19,6 +19,9 @@ from . import ttf_fpn from . import centernet_fpn from . import bifpn from . import csp_pan +from . import es_pan +from . import lc_pan +from . 
import custom_pan from .fpn import * from .yolo_fpn import * @@ -28,3 +31,6 @@ from .centernet_fpn import * from .blazeface_fpn import * from .bifpn import * from .csp_pan import * +from .es_pan import * +from .lc_pan import * +from .custom_pan import * diff --git a/paddlers/models/ppdet/modeling/necks/centernet_fpn.py b/paddlers/models/ppdet/modeling/necks/centernet_fpn.py old mode 100644 new mode 100755 index d5a7322..81a3681 --- a/paddlers/models/ppdet/modeling/necks/centernet_fpn.py +++ b/paddlers/models/ppdet/modeling/necks/centernet_fpn.py @@ -164,11 +164,11 @@ class IDAUp(nn.Layer): for i in range(start_level + 1, end_level): upsample = getattr(self, 'up_' + str(i - start_level)) project = getattr(self, 'proj_' + str(i - start_level)) - inputs[i] = project(inputs[i]) inputs[i] = upsample(inputs[i]) node = getattr(self, 'node_' + str(i - start_level)) inputs[i] = node(paddle.add(inputs[i], inputs[i - 1])) + return inputs class DLAUp(nn.Layer): @@ -197,8 +197,8 @@ class DLAUp(nn.Layer): out = [inputs[-1]] # start with 32 for i in range(len(inputs) - self.start_level - 1): ida = getattr(self, 'ida_{}'.format(i)) - ida(inputs, len(inputs) - i - 2, len(inputs)) - out.insert(0, inputs[-1]) + outputs = ida(inputs, len(inputs) - i - 2, len(inputs)) + out.insert(0, outputs[-1]) return out @@ -259,7 +259,9 @@ class CenterNetDLAFPN(nn.Layer): def forward(self, body_feats): - dla_up_feats = self.dla_up(body_feats) + inputs = [body_feats[i] for i in range(len(body_feats))] + + dla_up_feats = self.dla_up(inputs) ida_up_feats = [] for i in range(self.last_level - self.first_level): diff --git a/paddlers/models/ppdet/modeling/necks/csp_pan.py b/paddlers/models/ppdet/modeling/necks/csp_pan.py index 0843462..2558b55 100644 --- a/paddlers/models/ppdet/modeling/necks/csp_pan.py +++ b/paddlers/models/ppdet/modeling/necks/csp_pan.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
# The code is based on: @@ -19,7 +19,6 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr -from paddle.regularizer import L2Decay from paddlers.models.ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec @@ -36,8 +35,6 @@ class ConvBNLayer(nn.Layer): act='leaky_relu'): super(ConvBNLayer, self).__init__() initializer = nn.initializer.KaimingUniform() - self.act = act - assert self.act in ['leaky_relu', "hard_swish"] self.conv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, @@ -48,13 +45,14 @@ class ConvBNLayer(nn.Layer): weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn = nn.BatchNorm2D(out_channel) + if act == "hard_swish": + act = 'hardswish' + self.act = act def forward(self, x): x = self.bn(self.conv(x)) - if self.act == "leaky_relu": - x = F.leaky_relu(x) - elif self.act == "hard_swish": - x = F.hardswish(x) + if self.act: + x = getattr(F, self.act)(x) return x @@ -75,10 +73,11 @@ class DPModule(nn.Layer): out_channel=96, kernel_size=3, stride=1, - act='leaky_relu'): + act='leaky_relu', + use_act_in_out=True): super(DPModule, self).__init__() initializer = nn.initializer.KaimingUniform() - self.act = act + self.use_act_in_out = use_act_in_out self.dwconv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, @@ -98,17 +97,17 @@ class DPModule(nn.Layer): weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn2 = nn.BatchNorm2D(out_channel) - - def act_func(self, x): - if self.act == "leaky_relu": - x = F.leaky_relu(x) - elif self.act == "hard_swish": - x = F.hardswish(x) - return x + if act == "hard_swish": + act = 'hardswish' + self.act = act def forward(self, x): - x = self.act_func(self.bn1(self.dwconv(x))) - x = self.act_func(self.bn2(self.pwconv(x))) + x = self.bn1(self.dwconv(x)) + if self.act: + x = getattr(F, self.act)(x) + x = self.bn2(self.pwconv(x)) + if self.use_act_in_out and self.act: + x = getattr(F, self.act)(x) return x diff --git a/paddlers/models/ppdet/modeling/necks/custom_pan.py b/paddlers/models/ppdet/modeling/necks/custom_pan.py new file mode 100644 index 0000000..76388e9 --- /dev/null +++ b/paddlers/models/ppdet/modeling/necks/custom_pan.py @@ -0,0 +1,225 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
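+# CustomCSPPAN (below) chains a top-down FPN pass and a bottom-up PAN pass built +# from CSPStage blocks: each stage forks its input through two 1x1 convs, passes one +# branch through `block_num` blocks (with an optional SPP in the middle of the +# deepest stage), then concatenates both branches and fuses them with a 1x1 conv.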
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlers.models.ppdet.core.workspace import register, serializable +from paddlers.models.ppdet.modeling.layers import DropBlock +from paddlers.models.ppdet.modeling.ops import get_act_fn +from ..backbones.cspresnet import ConvBNLayer, BasicBlock +from ..shape_spec import ShapeSpec + +__all__ = ['CustomCSPPAN'] + + +class SPP(nn.Layer): + def __init__(self, + ch_in, + ch_out, + k, + pool_size, + act='swish', + data_format='NCHW'): + super(SPP, self).__init__() + self.pool = [] + self.data_format = data_format + for i, size in enumerate(pool_size): + pool = self.add_sublayer( + 'pool{}'.format(i), + nn.MaxPool2D( + kernel_size=size, + stride=1, + padding=size // 2, + data_format=data_format, + ceil_mode=False)) + self.pool.append(pool) + self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) + + def forward(self, x): + outs = [x] + for pool in self.pool: + outs.append(pool(x)) + if self.data_format == 'NCHW': + y = paddle.concat(outs, axis=1) + else: + y = paddle.concat(outs, axis=-1) + + y = self.conv(y) + return y + + +class CSPStage(nn.Layer): + def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False): + super(CSPStage, self).__init__() + + ch_mid = int(ch_out // 2) + self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + self.convs = nn.Sequential() + next_ch_in = ch_mid + for i in range(n): + self.convs.add_sublayer( + str(i), + eval(block_fn)(next_ch_in, ch_mid, act=act, shortcut=False)) + if i == (n - 1) // 2 and spp: + self.convs.add_sublayer( + 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) + next_ch_in = ch_mid + self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act) + + def forward(self, x): + y1 = self.conv1(x) + y2 = self.conv2(x) + y2 = self.convs(y2) + y = paddle.concat([y1, y2], axis=1) + y = self.conv3(y) + return y + + +@register +@serializable +class CustomCSPPAN(nn.Layer): + __shared__ = ['norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt'] + + def __init__(self, + in_channels=[256, 512, 1024], + out_channels=[1024, 512, 256], + norm_type='bn', + act='leaky', + stage_fn='CSPStage', + block_fn='BasicBlock', + stage_num=1, + block_num=3, + drop_block=False, + block_size=3, + keep_prob=0.9, + spp=False, + data_format='NCHW', + width_mult=1.0, + depth_mult=1.0, + trt=False): + + super(CustomCSPPAN, self).__init__() + out_channels = [max(round(c * width_mult), 1) for c in out_channels] + block_num = max(round(block_num * depth_mult), 1) + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, dict)) else act + self.num_blocks = len(in_channels) + self.data_format = data_format + self._out_channels = out_channels + in_channels = in_channels[::-1] + fpn_stages = [] + fpn_routes = [] + for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)): + if i > 0: + ch_in += ch_pre // 2 + + stage = nn.Sequential() + for j in range(stage_num): + stage.add_sublayer( + str(j), + eval(stage_fn)(block_fn, + ch_in if j == 0 else ch_out, + ch_out, + block_num, + act=act, + spp=(spp and i == 0))) + + if drop_block: + stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) + + fpn_stages.append(stage) + + if i < self.num_blocks - 1: + fpn_routes.append( + ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out // 2, + filter_size=1, + stride=1, + padding=0, + act=act)) + + ch_pre = ch_out + + self.fpn_stages = nn.LayerList(fpn_stages) + self.fpn_routes = nn.LayerList(fpn_routes) + + pan_stages = 
[] + pan_routes = [] + for i in reversed(range(self.num_blocks - 1)): + pan_routes.append( + ConvBNLayer( + ch_in=out_channels[i + 1], + ch_out=out_channels[i + 1], + filter_size=3, + stride=2, + padding=1, + act=act)) + + ch_in = out_channels[i] + out_channels[i + 1] + ch_out = out_channels[i] + stage = nn.Sequential() + for j in range(stage_num): + stage.add_sublayer( + str(j), + eval(stage_fn)(block_fn, + ch_in if j == 0 else ch_out, + ch_out, + block_num, + act=act, + spp=False)) + if drop_block: + stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) + + pan_stages.append(stage) + + self.pan_stages = nn.LayerList(pan_stages[::-1]) + self.pan_routes = nn.LayerList(pan_routes[::-1]) + + def forward(self, blocks, for_mot=False): + blocks = blocks[::-1] + fpn_feats = [] + + for i, block in enumerate(blocks): + if i > 0: + block = paddle.concat([route, block], axis=1) + route = self.fpn_stages[i](block) + fpn_feats.append(route) + + if i < self.num_blocks - 1: + route = self.fpn_routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + pan_feats = [fpn_feats[-1], ] + route = fpn_feats[-1] + for i in reversed(range(self.num_blocks - 1)): + block = fpn_feats[i] + route = self.pan_routes[i](route) + block = paddle.concat([route, block], axis=1) + route = self.pan_stages[i](block) + pan_feats.append(route) + + return pan_feats[::-1] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/paddlers/models/ppdet/modeling/necks/es_pan.py b/paddlers/models/ppdet/modeling/necks/es_pan.py new file mode 100644 index 0000000..1d7d31a --- /dev/null +++ b/paddlers/models/ppdet/modeling/necks/es_pan.py @@ -0,0 +1,212 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
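+# ES_Block (below) runs a residual 1x1-conv branch in parallel with a +# pointwise-then-depthwise branch whose two halves are concatenated, re-weighted by +# an SEModule and fused by a 1x1 conv; both branch outputs are then concatenated and +# projected back to `out_channels` by a final 1x1 conv.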
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddlers.models.ppdet.core.workspace import register, serializable + +from ..shape_spec import ShapeSpec +from ..backbones.esnet import SEModule +from .csp_pan import ConvBNLayer, Channel_T, DPModule + +__all__ = ['ESPAN'] + + +class ES_Block(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size=5, + stride=1, + act='leaky_relu'): + super(ES_Block, self).__init__() + self._residual = ConvBNLayer( + in_channel=in_channels, + out_channel=out_channels, + kernel_size=1, + stride=1, + groups=1, + act=act) + self._conv_pw = ConvBNLayer( + in_channel=in_channels, + out_channel=mid_channels // 2, + kernel_size=1, + stride=1, + groups=1, + act=act) + self._conv_dw = ConvBNLayer( + in_channel=mid_channels // 2, + out_channel=mid_channels // 2, + kernel_size=kernel_size, + stride=stride, + groups=mid_channels // 2, + act=None) + self._se = SEModule(mid_channels) + + self._conv_linear = ConvBNLayer( + in_channel=mid_channels, + out_channel=out_channels, + kernel_size=1, + stride=1, + groups=1, + act=act) + + self._out_conv = ConvBNLayer( + in_channel=out_channels * 2, + out_channel=out_channels, + kernel_size=1, + stride=1, + groups=1, + act=act) + + def forward(self, inputs): + x1 = self._residual(inputs) + x2 = self._conv_pw(inputs) + x3 = self._conv_dw(x2) + x3 = paddle.concat([x2, x3], axis=1) + x3 = self._se(x3) + x3 = self._conv_linear(x3) + out = paddle.concat([x1, x3], axis=1) + out = self._out_conv(out) + return out + + +@register +@serializable +class ESPAN(nn.Layer): + """Path Aggregation Network with ES module. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + kernel_size (int): The conv2d kernel size of this Module. + num_features (int): Number of output features of CSPPAN module. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks.
Default: True + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=5, + num_features=3, + use_depthwise=True, + act='hard_swish', + spatial_scales=[0.125, 0.0625, 0.03125]): + super(ESPAN, self).__init__() + self.conv_t = Channel_T(in_channels, out_channels, act=act) + in_channels = [out_channels] * len(spatial_scales) + self.in_channels = in_channels + self.out_channels = out_channels + self.spatial_scales = spatial_scales + self.num_features = num_features + conv_func = DPModule if use_depthwise else ConvBNLayer + + if self.num_features == 4: + self.first_top_conv = conv_func( + in_channels[0], in_channels[0], kernel_size, stride=2, act=act) + self.second_top_conv = conv_func( + in_channels[0], in_channels[0], kernel_size, stride=2, act=act) + self.spatial_scales.append(self.spatial_scales[-1] / 2) + + # build top-down blocks + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.top_down_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.top_down_blocks.append( + ES_Block( + in_channels[idx - 1] * 2, + in_channels[idx - 1], + in_channels[idx - 1], + kernel_size=kernel_size, + stride=1, + act=act)) + + # build bottom-up blocks + self.downsamples = nn.LayerList() + self.bottom_up_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv_func( + in_channels[idx], + in_channels[idx], + kernel_size=kernel_size, + stride=2, + act=act)) + self.bottom_up_blocks.append( + ES_Block( + in_channels[idx] * 2, + in_channels[idx + 1], + in_channels[idx + 1], + kernel_size=kernel_size, + stride=1, + act=act)) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + + Returns: + tuple[Tensor]: CSPPAN features. + """ + assert len(inputs) == len(self.in_channels) + inputs = self.conv_t(inputs) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx](paddle.concat( + [downsample_feat, feat_height], 1)) + outs.append(out) + + top_features = None + if self.num_features == 4: + top_features = self.first_top_conv(inputs[-1]) + top_features = top_features + self.second_top_conv(outs[-1]) + outs.append(top_features) + + return tuple(outs) + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self.out_channels, stride=1. / s) + for s in self.spatial_scales + ] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } diff --git a/paddlers/models/ppdet/modeling/necks/fpn.py b/paddlers/models/ppdet/modeling/necks/fpn.py index 6bf1f94..472101c 100644 --- a/paddlers/models/ppdet/modeling/necks/fpn.py +++ b/paddlers/models/ppdet/modeling/necks/fpn.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import paddle.nn as nn @@ -31,28 +31,28 @@ class FPN(nn.Layer): Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 Args: - in_channels (list[int]): input channels of each level which can be + in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config - out_channel (list[int]): output channel of each level + out_channel (int): output channel of each level spatial_scales (list[float]): the spatial scales between input feature - maps and original input image which can be derived from the output + maps and original input image which can be derived from the output shape of backbone by from_config has_extra_convs (bool): whether to add extra conv to the last level. default False extra_stage (int): the number of extra stages added to the last level. default 1 - use_c5 (bool): Whether to use c5 as the input of extra stage, + use_c5 (bool): Whether to use c5 as the input of extra stage, otherwise p5 is used. default True - norm_type (string|None): The normalization type in FPN module. If - norm_type is None, norm will not be used after conv and if + norm_type (string|None): The normalization type in FPN module. If + norm_type is None, norm will not be used after conv and if norm_type is string, bn, gn, sync_bn are available. default None norm_decay (float): weight decay for normalization layer weights. default 0. - freeze_norm (bool): whether to freeze normalization layer. + freeze_norm (bool): whether to freeze normalization layer. default False relu_before_extra_convs (bool): whether to add relu before extra convs. default False - + """ def __init__(self, diff --git a/paddlers/models/ppdet/modeling/necks/hrfpn.py b/paddlers/models/ppdet/modeling/necks/hrfpn.py index 785c572..0a17ea1 100644 --- a/paddlers/models/ppdet/modeling/necks/hrfpn.py +++ b/paddlers/models/ppdet/modeling/necks/hrfpn.py @@ -37,7 +37,8 @@ class HRFPN(nn.Layer): out_channel=256, share_conv=False, extra_stage=1, - spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32]): + spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32], + use_bias=False): super(HRFPN, self).__init__() in_channel = sum(in_channels) self.in_channel = in_channel @@ -47,12 +48,14 @@ class HRFPN(nn.Layer): spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
self.spatial_scales = spatial_scales self.num_out = len(self.spatial_scales) + self.use_bias = use_bias + bias_attr = False if use_bias is False else None self.reduction = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=1, - bias_attr=False) + bias_attr=bias_attr) if share_conv: self.fpn_conv = nn.Conv2D( @@ -60,7 +63,7 @@ out_channels=out_channel, kernel_size=3, padding=1, - bias_attr=False) + bias_attr=bias_attr) else: self.fpn_conv = [] for i in range(self.num_out): @@ -72,7 +75,7 @@ out_channels=out_channel, kernel_size=3, padding=1, - bias_attr=False)) + bias_attr=bias_attr)) self.fpn_conv.append(conv) def forward(self, body_feats): diff --git a/paddlers/models/ppdet/modeling/necks/lc_pan.py b/paddlers/models/ppdet/modeling/necks/lc_pan.py new file mode 100644 index 0000000..0faf32b --- /dev/null +++ b/paddlers/models/ppdet/modeling/necks/lc_pan.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddlers.models.ppdet.core.workspace import register, serializable + +from ..shape_spec import ShapeSpec +from ..backbones.lcnet import DepthwiseSeparable +from .csp_pan import ConvBNLayer, Channel_T, DPModule + +__all__ = ['LCPAN'] + + +@register +@serializable +class LCPAN(nn.Layer): + """Path Aggregation Network with LCNet module. + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + kernel_size (int): The conv2d kernel size of this Module. + num_features (int): Number of output features of CSPPAN module. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks.
Default: True + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=5, + num_features=3, + use_depthwise=True, + act='hard_swish', + spatial_scales=[0.125, 0.0625, 0.03125]): + super(LCPAN, self).__init__() + self.conv_t = Channel_T(in_channels, out_channels, act=act) + in_channels = [out_channels] * len(spatial_scales) + self.in_channels = in_channels + self.out_channels = out_channels + self.spatial_scales = spatial_scales + self.num_features = num_features + conv_func = DPModule if use_depthwise else ConvBNLayer + + NET_CONFIG = { + #k, in_c, out_c, stride, use_se + "block1": [ + [kernel_size, out_channels * 2, out_channels * 2, 1, False], + [kernel_size, out_channels * 2, out_channels, 1, False], + ], + "block2": [ + [kernel_size, out_channels * 2, out_channels * 2, 1, False], + [kernel_size, out_channels * 2, out_channels, 1, False], + ] + } + + if self.num_features == 4: + self.first_top_conv = conv_func( + in_channels[0], in_channels[0], kernel_size, stride=2, act=act) + self.second_top_conv = conv_func( + in_channels[0], in_channels[0], kernel_size, stride=2, act=act) + self.spatial_scales.append(self.spatial_scales[-1] / 2) + + # build top-down blocks + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.top_down_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.top_down_blocks.append( + nn.Sequential(*[ + DepthwiseSeparable( + num_channels=in_c, + num_filters=out_c, + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ + "block1"]) + ])) + + # build bottom-up blocks + self.downsamples = nn.LayerList() + self.bottom_up_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv_func( + in_channels[idx], + in_channels[idx], + kernel_size=kernel_size, + stride=2, + act=act)) + self.bottom_up_blocks.append( + nn.Sequential(*[ + DepthwiseSeparable( + num_channels=in_c, + num_filters=out_c, + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ + "block2"]) + ])) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + Returns: + tuple[Tensor]: CSPPAN features. + """ + assert len(inputs) == len(self.in_channels) + inputs = self.conv_t(inputs) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx](paddle.concat( + [downsample_feat, feat_height], 1)) + outs.append(out) + + top_features = None + if self.num_features == 4: + top_features = self.first_top_conv(inputs[-1]) + top_features = top_features + self.second_top_conv(outs[-1]) + outs.append(top_features) + + return tuple(outs) + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self.out_channels, stride=1. 
/ s) + for s in self.spatial_scales + ] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } diff --git a/paddlers/models/ppdet/modeling/necks/ttf_fpn.py b/paddlers/models/ppdet/modeling/necks/ttf_fpn.py index ae2f245..f25cfc1 100644 --- a/paddlers/models/ppdet/modeling/necks/ttf_fpn.py +++ b/paddlers/models/ppdet/modeling/necks/ttf_fpn.py @@ -166,9 +166,9 @@ class TTFFPN(nn.Layer): shortcut_num (list): the number of convolution layers in each shortcut. [3,2,1] by default, means DarkNet53 backbone return_idx_1 has 3 convs in its shortcut, return_idx_2 has 2 convs and return_idx_3 has 1 conv. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. + norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default - lite_neck (bool): whether to use lite conv in TTFNet FPN, + lite_neck (bool): whether to use lite conv in TTFNet FPN, False by default fusion_method (string): the method to fusion upsample and lateral layer. 'add' and 'concat' are optional, add by default diff --git a/paddlers/models/ppdet/modeling/necks/yolo_fpn.py b/paddlers/models/ppdet/modeling/necks/yolo_fpn.py index a859dee..bd667a2 100644 --- a/paddlers/models/ppdet/modeling/necks/yolo_fpn.py +++ b/paddlers/models/ppdet/modeling/necks/yolo_fpn.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
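+# YOLOCSPPAN, registered at the end of this module, follows the CSP-PAN scheme used +# in YOLOv5 and YOLOX: 1x1 lateral convs feed CSPLayer fusion blocks along the +# top-down path, and stride-2 convs feed CSPLayer blocks along the bottom-up path.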
import paddle @@ -17,10 +17,12 @@ import paddle.nn as nn import paddle.nn.functional as F from paddlers.models.ppdet.core.workspace import register, serializable from paddlers.models.ppdet.modeling.layers import DropBlock +from paddlers.models.ppdet.modeling.ops import get_act_fn from ..backbones.darknet import ConvBNLayer from ..shape_spec import ShapeSpec +from ..backbones.csp_darknet import BaseConv, DWConv, CSPLayer -__all__ = ['YOLOv3FPN', 'PPYOLOFPN', 'PPYOLOTinyFPN', 'PPYOLOPAN'] +__all__ = ['YOLOv3FPN', 'PPYOLOFPN', 'PPYOLOTinyFPN', 'PPYOLOPAN', 'YOLOCSPPAN'] def add_coord(x, data_format): @@ -114,7 +116,7 @@ class SPP(nn.Layer): ch_out, k, pool_size, - norm_type, + norm_type='bn', freeze_norm=False, name='', act='leaky', @@ -267,7 +269,7 @@ class PPYOLOTinyDetBlock(nn.Layer): self.conv_module = nn.Sequential() cfgs = [ - # name, in channels, out channels, filter_size, + # name, in channels, out channels, filter_size, # stride, padding, groups ['.0', ch_in, ch_out, 1, 1, 0, 1], ['.1', ch_out, ch_out, 5, 1, 2, ch_out], @@ -679,7 +681,7 @@ class PPYOLOTinyFPN(nn.Layer): detection_block_channels (list): channels in fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC - kwargs: extra key-value pairs, such as parameter of DropBlock and spp + kwargs: extra key-value pairs, such as parameter of DropBlock and spp """ super(PPYOLOTinyFPN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" @@ -986,3 +988,112 @@ class PPYOLOPAN(nn.Layer): @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class YOLOCSPPAN(nn.Layer): + """ + YOLO CSP-PAN, used in YOLOv5 and YOLOX. + """ + __shared__ = ['depth_mult', 'data_format', 'act', 'trt'] + + def __init__(self, + depth_mult=1.0, + in_channels=[256, 512, 1024], + depthwise=False, + data_format='NCHW', + act='silu', + trt=False): + super(YOLOCSPPAN, self).__init__() + self.in_channels = in_channels + self._out_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.data_format = data_format + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, dict)) else act + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + # top-down fpn + self.lateral_convs = nn.LayerList() + self.fpn_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append( + BaseConv( + int(in_channels[idx]), + int(in_channels[idx - 1]), + 1, + 1, + act=act)) + self.fpn_blocks.append( + CSPLayer( + int(in_channels[idx - 1] * 2), + int(in_channels[idx - 1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act)) + + # bottom-up pan + self.downsample_convs = nn.LayerList() + self.pan_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsample_convs.append( + Conv( + int(in_channels[idx]), + int(in_channels[idx]), + 3, + stride=2, + act=act)) + self.pan_blocks.append( + CSPLayer( + int(in_channels[idx] * 2), + int(in_channels[idx + 1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act)) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + + # top-down fpn + inner_outs = [feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = F.interpolate( + feat_heigh, + 
scale_factor=2., + mode="nearest", + data_format=self.data_format) + inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat( + [upsample_feat, feat_low], axis=1)) + inner_outs.insert(0, inner_out) + + # bottom-up pan + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](paddle.concat( + [downsample_feat, feat_height], axis=1)) + outs.append(out) + + return outs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/paddlers/models/ppdet/modeling/ops.py b/paddlers/models/ppdet/modeling/ops.py index 005a131..26a3171 100644 --- a/paddlers/models/ppdet/modeling/ops.py +++ b/paddlers/models/ppdet/modeling/ops.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
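+# get_act_fn (below) resolves an activation given as a name or a config dict with a +# 'name' key into a callable: TRT-friendly substitutes from TRT_ACT_SPEC take +# precedence when trt=True, then ACT_SPEC, with a fallback to the attribute of +# paddle.nn.functional of the same name.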
import paddle @@ -17,31 +17,72 @@ import paddle.nn.functional as F import paddle.nn as nn from paddle import ParamAttr from paddle.regularizer import L2Decay +try: + import paddle._legacy_C_ops as C_ops +except ImportError: + import paddle._C_ops as C_ops -from paddle.fluid.framework import Variable, in_dygraph_mode -from paddle.fluid import core -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype +from paddle import in_dynamic_mode +from paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype __all__ = [ - 'roi_pool', - 'roi_align', 'prior_box', 'generate_proposals', - 'iou_similarity', 'box_coder', - 'yolo_box', 'multiclass_nms', 'distribute_fpn_proposals', - 'collect_fpn_proposals', 'matrix_nms', 'batch_norm', 'mish', + 'silu', + 'swish', + 'identity', ] +def identity(x): + return x + + def mish(x): - return x * paddle.tanh(F.softplus(x)) + return F.mish(x) if hasattr(F, 'mish') else x * F.tanh(F.softplus(x)) + + +def silu(x): + return F.silu(x) + + +def swish(x): + return x * F.sigmoid(x) + + +TRT_ACT_SPEC = {'swish': swish, 'silu': swish} + +ACT_SPEC = {'mish': mish, 'silu': silu} + + +def get_act_fn(act=None, trt=False): + assert act is None or isinstance(act, ( + str, dict)), 'name of activation should be str, dict or None' + if not act: + return identity + + if isinstance(act, dict): + name = act['name'] + act.pop('name') + kwargs = act + else: + name = act + kwargs = dict() + + if trt and name in TRT_ACT_SPEC: + fn = TRT_ACT_SPEC[name] + elif name in ACT_SPEC: + fn = ACT_SPEC[name] + else: + fn = getattr(F, name) + + return lambda x: fn(x, **kwargs) def batch_norm(ch, @@ -77,390 +118,6 @@ return norm_layer -@paddle.jit.not_to_static -def roi_pool(input, - rois, - output_size, - spatial_scale=1.0, - rois_num=None, - name=None): - """ - - This operator implements the roi_pooling layer. - Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). - - The operator has three steps: - - 1. Dividing each region proposal into equal-sized sections with output_size(h, w); - 2. Finding the largest value in each section; - 3. Copying these max values to the output buffer. - - For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn - - Args: - input (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], - where N is the batch size, C is the input channel, H is Height, W is weight. - The data type is float32 or float64. - rois (Tensor): ROIs (Regions of Interest) to pool over. - 2D-Tensor or 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. - Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, - and (x2, y2) is the bottom right coordinates. - output_size (int or tuple[int, int]): The pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. - spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 - rois_num (Tensor): The number of RoIs in each image. Default: None - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default.
- - - Returns: - Tensor: The pooled feature, 4D-Tensor with the shape of [num_rois, C, output_size[0], output_size[1]]. - - - Examples: - - .. code-block:: python - - import paddle - from paddlers.models.ppdet.modeling import ops - paddle.enable_static() - - x = paddle.static.data( - name='data', shape=[None, 256, 32, 32], dtype='float32') - rois = paddle.static.data( - name='rois', shape=[None, 4], dtype='float32') - rois_num = paddle.static.data(name='rois_num', shape=[None], dtype='int32') - - pool_out = ops.roi_pool( - input=x, - rois=rois, - output_size=(1, 1), - spatial_scale=1.0, - rois_num=rois_num) - """ - check_type(output_size, 'output_size', (int, tuple), 'roi_pool') - if isinstance(output_size, int): - output_size = (output_size, output_size) - - pooled_height, pooled_width = output_size - if in_dygraph_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - pool_out, argmaxes = core.ops.roi_pool( - input, rois, rois_num, "pooled_height", pooled_height, - "pooled_width", pooled_width, "spatial_scale", spatial_scale) - return pool_out, argmaxes - - else: - check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool') - check_variable_and_dtype(rois, 'rois', ['float32'], 'roi_pool') - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - - inputs = { - "X": input, - "ROIs": rois, - } - if rois_num is not None: - inputs['RoisNum'] = rois_num - helper.append_op( - type="roi_pool", - inputs=inputs, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out, argmaxes - - -@paddle.jit.not_to_static -def roi_align(input, - rois, - output_size, - spatial_scale=1.0, - sampling_ratio=-1, - rois_num=None, - aligned=True, - name=None): - """ - - Region of interest align (also known as RoI align) is to perform - bilinear interpolation on inputs of nonuniform sizes to obtain - fixed-size feature maps (e.g. 7*7) - - Dividing each region proposal into equal-sized sections with - the pooled_width and pooled_height. Location remains the origin - result. - - In each ROI bin, the value of the four regularly sampled locations - are computed directly through bilinear interpolation. The output is - the mean of four locations. - Thus avoid the misaligned problem. - - Args: - input (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], - where N is the batch size, C is the input channel, H is Height, W is weight. - The data type is float32 or float64. - rois (Tensor): ROIs (Regions of Interest) to pool over.It should be - a 2-D Tensor or 2-D LoDTensor of shape (num_rois, 4), the lod level is 1. - The data type is float32 or float64. Given as [[x1, y1, x2, y2], ...], - (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. - output_size (int or tuple[int, int]): The pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. - spatial_scale (float32, optional): Multiplicative spatial scale factor to translate ROI coords - from their input scale to the scale used when pooling. Default: 1.0 - sampling_ratio(int32, optional): number of sampling points in the interpolation grid. - If <=0, then grid points are adaptive to roi_width and pooled_w, likewise for height. 
Default: -1 - rois_num (Tensor): The number of RoIs in each image. Default: None - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: - - Output: The output of ROIAlignOp is a 4-D tensor with shape (num_rois, channels, pooled_h, pooled_w). The data type is float32 or float64. - - - Examples: - .. code-block:: python - - import paddle - from paddlers.models.ppdet.modeling import ops - paddle.enable_static() - - x = paddle.static.data( - name='data', shape=[None, 256, 32, 32], dtype='float32') - rois = paddle.static.data( - name='rois', shape=[None, 4], dtype='float32') - rois_num = paddle.static.data(name='rois_num', shape=[None], dtype='int32') - align_out = ops.roi_align(input=x, - rois=rois, - ouput_size=(7, 7), - spatial_scale=0.5, - sampling_ratio=-1, - rois_num=rois_num) - """ - check_type(output_size, 'output_size', (int, tuple), 'roi_align') - if isinstance(output_size, int): - output_size = (output_size, output_size) - - pooled_height, pooled_width = output_size - - if in_dygraph_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - align_out = core.ops.roi_align( - input, rois, rois_num, "pooled_height", pooled_height, - "pooled_width", pooled_width, "spatial_scale", spatial_scale, - "sampling_ratio", sampling_ratio, "aligned", aligned) - return align_out - - else: - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'roi_align') - check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], - 'roi_align') - helper = LayerHelper('roi_align', **locals()) - dtype = helper.input_dtype() - align_out = helper.create_variable_for_type_inference(dtype) - inputs = { - "X": input, - "ROIs": rois, - } - if rois_num is not None: - inputs['RoisNum'] = rois_num - helper.append_op( - type="roi_align", - inputs=inputs, - outputs={"Out": align_out}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale, - "sampling_ratio": sampling_ratio, - "aligned": aligned, - }) - return align_out - - -@paddle.jit.not_to_static -def iou_similarity(x, y, box_normalized=True, name=None): - """ - Computes intersection-over-union (IOU) between two box lists. - Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, - boxes in 'Y' are shared by all instance of the batched inputs of X. - Given two boxes A and B, the calculation of IOU is as follows: - - $$ - IOU(A, B) = - \\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} - $$ - - Args: - x (Tensor): Box list X is a 2-D Tensor with shape [N, 4] holds N - boxes, each box is represented as [xmin, ymin, xmax, ymax], - the shape of X is [N, 4]. [xmin, ymin] is the left top - coordinate of the box if the input is image feature map, they - are close to the origin of the coordinate system. - [xmax, ymax] is the right bottom coordinate of the box. - The data type is float32 or float64. - y (Tensor): Box list Y holds M boxes, each box is represented as - [xmin, ymin, xmax, ymax], the shape of X is [N, 4]. - [xmin, ymin] is the left top coordinate of the box if the - input is image feature map, and [xmax, ymax] is the right - bottom coordinate of the box. The data type is float32 or float64. - box_normalized(bool): Whether treat the priorbox as a normalized box. - Set true by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. 
- - Returns: - Tensor: The output of iou_similarity op, a tensor with shape [N, M] - representing pairwise iou scores. The data type is same with x. - - Examples: - .. code-block:: python - - import paddle - from paddlers.models.ppdet.modeling import ops - paddle.enable_static() - - x = paddle.static.data(name='x', shape=[None, 4], dtype='float32') - y = paddle.static.data(name='y', shape=[None, 4], dtype='float32') - iou = ops.iou_similarity(x=x, y=y) - """ - - if in_dygraph_mode(): - out = core.ops.iou_similarity(x, y, 'box_normalized', box_normalized) - return out - else: - helper = LayerHelper("iou_similarity", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op( - type="iou_similarity", - inputs={"X": x, - "Y": y}, - attrs={"box_normalized": box_normalized}, - outputs={"Out": out}) - return out - - -@paddle.jit.not_to_static -def collect_fpn_proposals(multi_rois, - multi_scores, - min_level, - max_level, - post_nms_top_n, - rois_num_per_level=None, - name=None): - """ - - **This OP only supports LoDTensor as input**. Concat multi-level RoIs - (Region of Interest) and select N RoIs with respect to multi_scores. - This operation performs the following steps: - - 1. Choose num_level RoIs and scores as input: num_level = max_level - min_level - 2. Concat multi-level RoIs and scores - 3. Sort scores and select post_nms_top_n scores - 4. Gather RoIs by selected indices from scores - 5. Re-sort RoIs by corresponding batch_id - - Args: - multi_rois(list): List of RoIs to collect. Element in list is 2-D - LoDTensor with shape [N, 4] and data type is float32 or float64, - N is the number of RoIs. - multi_scores(list): List of scores of RoIs to collect. Element in list - is 2-D LoDTensor with shape [N, 1] and data type is float32 or - float64, N is the number of RoIs. - min_level(int): The lowest level of FPN layer to collect - max_level(int): The highest level of FPN layer to collect - post_nms_top_n(int): The number of selected RoIs - rois_num_per_level(list, optional): The List of RoIs' numbers. - Each element is 1-D Tensor which contains the RoIs' number of each - image on each level and the shape is [B] and data type is - int32, B is the number of images. If it is not None then return - a 1-D Tensor contains the output RoIs' number of each image and - the shape is [B]. Default: None - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Variable: - - fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is - float32 or float64. Selected RoIs. - - rois_num(Tensor): 1-D Tensor contains the RoIs's number of each - image. The shape is [B] and data type is int32. B is the number of - images. - - Examples: - .. 
code-block:: python - - import paddle - from paddlers.models.ppdet.modeling import ops - paddle.enable_static() - multi_rois = [] - multi_scores = [] - for i in range(4): - multi_rois.append(paddle.static.data( - name='roi_'+str(i), shape=[None, 4], dtype='float32', lod_level=1)) - for i in range(4): - multi_scores.append(paddle.static.data( - name='score_'+str(i), shape=[None, 1], dtype='float32', lod_level=1)) - - fpn_rois = ops.collect_fpn_proposals( - multi_rois=multi_rois, - multi_scores=multi_scores, - min_level=2, - max_level=5, - post_nms_top_n=2000) - """ - check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals') - check_type(multi_scores, 'multi_scores', list, 'collect_fpn_proposals') - num_lvl = max_level - min_level + 1 - input_rois = multi_rois[:num_lvl] - input_scores = multi_scores[:num_lvl] - - if in_dygraph_mode(): - assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode." - attrs = ('post_nms_topN', post_nms_top_n) - output_rois, rois_num = core.ops.collect_fpn_proposals( - input_rois, input_scores, rois_num_per_level, *attrs) - return output_rois, rois_num - - else: - helper = LayerHelper('collect_fpn_proposals', **locals()) - dtype = helper.input_dtype('multi_rois') - check_dtype(dtype, 'multi_rois', ['float32', 'float64'], - 'collect_fpn_proposals') - output_rois = helper.create_variable_for_type_inference(dtype) - output_rois.stop_gradient = True - - inputs = { - 'MultiLevelRois': input_rois, - 'MultiLevelScores': input_scores, - } - outputs = {'FpnRois': output_rois} - if rois_num_per_level is not None: - inputs['MultiLevelRoIsNum'] = rois_num_per_level - rois_num = helper.create_variable_for_type_inference(dtype='int32') - rois_num.stop_gradient = True - outputs['RoisNum'] = rois_num - helper.append_op( - type='collect_fpn_proposals', - inputs=inputs, - outputs=outputs, - attrs={'post_nms_topN': post_nms_top_n}) - return output_rois, rois_num - - @paddle.jit.not_to_static def distribute_fpn_proposals(fpn_rois, min_level, @@ -471,14 +128,14 @@ def distribute_fpn_proposals(fpn_rois, rois_num=None, name=None): r""" - - **This op only takes LoDTensor as input.** In Feature Pyramid Networks - (FPN) models, it is needed to distribute all proposals into different FPN - level, with respect to scale of the proposals, the referring scale and the - referring level. Besides, to restore the order of proposals, we return an - array which indicates the original index of rois in current proposals. + + **This op only takes LoDTensor as input.** In Feature Pyramid Networks + (FPN) models, it is needed to distribute all proposals into different FPN + level, with respect to scale of the proposals, the referring scale and the + referring level. Besides, to restore the order of proposals, we return an + array which indicates the original index of rois in current proposals. To compute FPN level for each roi, the formula is given as follows: - + .. math:: roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} @@ -489,36 +146,36 @@ def distribute_fpn_proposals(fpn_rois, Args: - fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is + fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is float32 or float64. The input fpn_rois. - min_level(int32): The lowest level of FPN layer where the proposals come + min_level(int32): The lowest level of FPN layer where the proposals come from. max_level(int32): The highest level of FPN layer where the proposals come from. refer_level(int32): The referring level of FPN layer with specified scale. 
refer_scale(int32): The referring scale of FPN layer with specified level. - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. - If it is not None then return a list of 1-D Tensor. Each element + If it is not None then return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: Tuple: - multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] - and data type of float32 and float64. The length is + multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] + and data type of float32 and float64. The length is max_level-min_level+1. The proposals in each FPN level. - restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is + restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is the number of total rois. The data type is int32. It is used to restore the order of fpn_rois. - rois_num_per_level(List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. The shape + rois_num_per_level(List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape is [B] and data type of int32. B is the number of images @@ -539,13 +196,14 @@ def distribute_fpn_proposals(fpn_rois, """ num_lvl = max_level - min_level + 1 - if in_dygraph_mode(): + if in_dynamic_mode(): assert rois_num is not None, "rois_num should not be None in dygraph mode." attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', refer_level, 'refer_scale', refer_scale, 'pixel_offset', pixel_offset) - multi_rois, restore_ind, rois_num_per_level = core.ops.distribute_fpn_proposals( + multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals( fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + return multi_rois, restore_ind, rois_num_per_level else: @@ -573,6 +231,8 @@ def distribute_fpn_proposals(fpn_rois, for i in range(num_lvl) ] outputs['MultiLevelRoIsNum'] = rois_num_per_level + else: + rois_num_per_level = None helper.append_op( type='distribute_fpn_proposals', @@ -588,143 +248,6 @@ def distribute_fpn_proposals(fpn_rois, return multi_rois, restore_ind, rois_num_per_level -@paddle.jit.not_to_static -def yolo_box( - x, - origin_shape, - anchors, - class_num, - conf_thresh, - downsample_ratio, - clip_bbox=True, - scale_x_y=1., - name=None, ): - """ - - This operator generates YOLO detection boxes from output of YOLOv3 network. - - The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict - given number boxes, this given number, which following will be represented as S, - is specified by the number of anchors. In the second dimension(the channel - dimension), C should be equal to S * (5 + class_num), class_num is the object - category number of source dataset(such as 80 in coco dataset), so the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, - also includes confidence score of the box and class one-hot key of each anchor - box. 
- Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box - predictions should be as follows: - $$ - b_x = \\sigma(t_x) + c_x - $$ - $$ - b_y = \\sigma(t_y) + c_y - $$ - $$ - b_w = p_w e^{t_w} - $$ - $$ - b_h = p_h e^{t_h} - $$ - in the equation above, :math:`c_x, c_y` is the left top corner of current grid - and :math:`p_w, p_h` is specified by anchors. - The logistic regression value of the 5th channel of each anchor prediction boxes - represents the confidence score of each prediction box, and the logistic - regression value of the last :attr:`class_num` channels of each anchor prediction - boxes represents the classifcation scores. Boxes with confidence scores less than - :attr:`conf_thresh` should be ignored, and box final scores is the product of - confidence scores and classification scores. - $$ - score_{pred} = score_{conf} * score_{class} - $$ - - Args: - x (Tensor): The input tensor of YoloBox operator is a 4-D tensor with shape of [N, C, H, W]. - The second dimension(C) stores box locations, confidence score and - classification one-hot keys of each anchor box. Generally, X should be the output of YOLOv3 network. - The data type is float32 or float64. - origin_shape (Tensor): The image size tensor of YoloBox operator, This is a 2-D tensor with shape of [N, 2]. - This tensor holds height and width of each input image used for resizing output box in input image - scale. The data type is int32. - anchors (list|tuple): The anchor width and height, it will be parsed pair by pair. - class_num (int): The number of classes to predict. - conf_thresh (float): The confidence scores threshold of detection boxes. Boxes with confidence scores - under threshold should be ignored. - downsample_ratio (int): The downsample ratio from network input to YoloBox operator input, - so 32, 16, 8 should be set for the first, second, and thrid YoloBox operators. - clip_bbox (bool): Whether clip output bonding box in Input(ImgSize) boundary. Default true. - scale_x_y (float): Scale the center point of decoded bounding box. Default 1.0. - name (string): The default value is None. Normally there is no need - for user to set this property. For more information, - please refer to :ref:`api_guide_Name` - - Returns: - boxes Tensor: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, N is the batch num, - M is output box number, and the 3rd dimension stores [xmin, ymin, xmax, ymax] coordinates of boxes. - scores Tensor: A 3-D tensor with shape [N, M, :attr:`class_num`], the coordinates of boxes, N is the batch num, - M is output box number. - - Raises: - TypeError: Attr anchors of yolo box must be list or tuple - TypeError: Attr class_num of yolo box must be an integer - TypeError: Attr conf_thresh of yolo box must be a float number - - Examples: - - .. 
code-block:: python - - import paddle - from paddlers.models.ppdet.modeling import ops - - paddle.enable_static() - x = paddle.static.data(name='x', shape=[None, 255, 13, 13], dtype='float32') - img_size = paddle.static.data(name='img_size',shape=[None, 2],dtype='int64') - anchors = [10, 13, 16, 30, 33, 23] - boxes,scores = ops.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, - conf_thresh=0.01, downsample_ratio=32) - """ - helper = LayerHelper('yolo_box', **locals()) - - if not isinstance(anchors, list) and not isinstance(anchors, tuple): - raise TypeError("Attr anchors of yolo_box must be list or tuple") - if not isinstance(class_num, int): - raise TypeError("Attr class_num of yolo_box must be an integer") - if not isinstance(conf_thresh, float): - raise TypeError("Attr ignore_thresh of yolo_box must be a float number") - - if in_dygraph_mode(): - attrs = ('anchors', anchors, 'class_num', class_num, 'conf_thresh', - conf_thresh, 'downsample_ratio', downsample_ratio, 'clip_bbox', - clip_bbox, 'scale_x_y', scale_x_y) - boxes, scores = core.ops.yolo_box(x, origin_shape, *attrs) - return boxes, scores - else: - boxes = helper.create_variable_for_type_inference(dtype=x.dtype) - scores = helper.create_variable_for_type_inference(dtype=x.dtype) - - attrs = { - "anchors": anchors, - "class_num": class_num, - "conf_thresh": conf_thresh, - "downsample_ratio": downsample_ratio, - "clip_bbox": clip_bbox, - "scale_x_y": scale_x_y, - } - - helper.append_op( - type='yolo_box', - inputs={ - "X": x, - "ImgSize": origin_shape, - }, - outputs={ - 'Boxes': boxes, - 'Scores': scores, - }, - attrs=attrs) - return boxes, scores - - @paddle.jit.not_to_static def prior_box(input, image, @@ -769,7 +292,7 @@ def prior_box(input, Caffe. Please note, this order affects the weights order of convolution layer followed by and does not affect the final detection results. Default: False. - name(str, optional): The default value is None. Normally there is no need for + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: @@ -827,14 +350,14 @@ def prior_box(input, max_sizes = [max_sizes] cur_max_sizes = max_sizes - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios, 'variances', variance, 'flip', flip, 'clip', clip, 'step_w', steps[0], 'step_h', steps[1], 'offset', offset, 'min_max_aspect_ratios_order', min_max_aspect_ratios_order) if cur_max_sizes is not None: attrs += ('max_sizes', cur_max_sizes) - box, var = core.ops.prior_box(input, image, *attrs) + box, var = C_ops.prior_box(input, image, *attrs) return box, var else: attrs = { @@ -929,9 +452,9 @@ def multiclass_nms(bboxes, step. -1 means keeping all bboxes after NMS step. normalized (bool): Whether detections are normalized. Default: True return_index(bool): Whether return selected index. Default: False - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. - If it is not None then return a list of 1-D Tensor. Each element + If it is not None then return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. name(str): Name of the multiclass nms op. Default: None. 
@@ -972,13 +495,13 @@ def multiclass_nms(bboxes, """ helper = LayerHelper('multiclass_nms3', **locals()) - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('background_label', background_label, 'score_threshold', score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, 'normalized', normalized) - output, index, nms_rois_num = core.ops.multiclass_nms3(bboxes, scores, - rois_num, *attrs) + output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores, + rois_num, *attrs) if not return_index: index = None return output, nms_rois_num, index @@ -1086,7 +609,7 @@ def matrix_nms(bboxes, from {0} to {1}) Index (Tensor): A 2-D Tensor with shape [No, 1] containing the selected indices, which are absolute values cross batches. - rois_num (Tensor): A 1-D Tensor with shape [N] containing + rois_num (Tensor): A 1-D Tensor with shape [N] containing the number of detected boxes in each image. Examples: .. code-block:: python @@ -1113,13 +636,13 @@ def matrix_nms(bboxes, check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') check_type(background_label, 'background_label', int, 'matrix_nms') - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('background_label', background_label, 'score_threshold', score_threshold, 'post_threshold', post_threshold, 'nms_top_k', nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', use_gaussian, 'keep_top_k', keep_top_k, 'normalized', normalized) - out, index, rois_num = core.ops.matrix_nms(bboxes, scores, *attrs) + out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) if not return_index: index = None if not return_rois_num: @@ -1158,111 +681,6 @@ def matrix_nms(bboxes, return output, rois_num, index -def bipartite_match(dist_matrix, - match_type=None, - dist_threshold=None, - name=None): - """ - - This operator implements a greedy bipartite matching algorithm, which is - used to obtain the matching with the maximum distance based on the input - distance matrix. For input 2D matrix, the bipartite matching algorithm can - find the matched column for each row (matched means the largest distance), - also can find the matched row for each column. And this operator only - calculate matched indices from column to row. For each instance, - the number of matched indices is the column number of the input distance - matrix. **The OP only supports CPU**. - - There are two outputs, matched indices and distance. - A simple description, this algorithm matched the best (maximum distance) - row entity to the column entity and the matched indices are not duplicated - in each row of ColToRowMatchIndices. If the column entity is not matched - any row entity, set -1 in ColToRowMatchIndices. - - NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor. - If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. - If Tensor, the height of ColToRowMatchIndices is 1. - - NOTE: This API is a very low level API. It is used by :code:`ssd_loss` - layer. Please consider to use :code:`ssd_loss` instead. - - Args: - dist_matrix(Tensor): This input is a 2-D LoDTensor with shape - [K, M]. The data type is float32 or float64. It is pair-wise - distance matrix between the entities represented by each row and - each column. For example, assumed one entity is A with shape [K], - another entity is B with shape [M]. The dist_matrix[i][j] is the - distance between A[i] and B[j]. The bigger the distance is, the - better matching the pairs are. 
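# --- Illustrative sketch (not part of the patch) ---------------------------
# The bipartite_match op being removed above implements greedy bipartite
# matching: repeatedly pick the largest remaining entry of the distance
# matrix, match its row to its column, and never reuse a row or column.
# A small NumPy rendition of that greedy rule, under the same "bigger
# distance = better match" convention:
import numpy as np

def greedy_bipartite_match(dist):
    """dist: [K, M]; returns col->row indices (-1 for unmatched columns)."""
    dist = dist.astype(np.float64).copy()
    match = -np.ones(dist.shape[1], dtype=np.int64)
    for _ in range(min(dist.shape)):
        i, j = np.unravel_index(np.argmax(dist), dist.shape)
        if dist[i, j] <= 0:          # nothing useful left to match
            break
        match[j] = i
        dist[i, :] = -np.inf         # row i is taken
        dist[:, j] = -np.inf         # column j is taken
    return match

print(greedy_bipartite_match(np.array([[0.9, 0.1], [0.2, 0.8]])))
# -> [0 1]: column 0 matches row 0, column 1 matches row 1.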
NOTE: This tensor can contain LoD - information to represent a batch of inputs. One instance of this - batch can contain different numbers of entities. - match_type(str, optional): The type of matching method, should be - 'bipartite' or 'per_prediction'. None ('bipartite') by default. - dist_threshold(float32, optional): If `match_type` is 'per_prediction', - this threshold is to determine the extra matching bboxes based - on the maximum distance, 0.5 by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tuple: - - matched_indices(Tensor): A 2-D Tensor with shape [N, M]. The data - type is int32. N is the batch size. If match_indices[i][j] is -1, it - means B[j] does not match any entity in i-th instance. - Otherwise, it means B[j] is matched to row - match_indices[i][j] in i-th instance. The row number of - i-th instance is saved in match_indices[i][j]. - - matched_distance(Tensor): A 2-D Tensor with shape [N, M]. The data - type is float32. N is batch size. If match_indices[i][j] is -1, - match_distance[i][j] is also -1.0. Otherwise, assumed - match_distance[i][j] = d, and the row offsets of each instance - are called LoD. Then match_distance[i][j] = - dist_matrix[d+LoD[i]][j]. - - Examples: - - .. code-block:: python - import paddle - from paddlers.models.ppdet.modeling import ops - from paddlers.models.ppdet.modeling.utils import iou_similarity - - paddle.enable_static() - - x = paddle.static.data(name='x', shape=[None, 4], dtype='float32') - y = paddle.static.data(name='y', shape=[None, 4], dtype='float32') - iou = iou_similarity(x=x, y=y) - matched_indices, matched_dist = ops.bipartite_match(iou) - """ - check_variable_and_dtype(dist_matrix, 'dist_matrix', - ['float32', 'float64'], 'bipartite_match') - - if in_dygraph_mode(): - match_indices, match_distance = core.ops.bipartite_match( - dist_matrix, "match_type", match_type, "dist_threshold", - dist_threshold) - return match_indices, match_distance - - helper = LayerHelper('bipartite_match', **locals()) - match_indices = helper.create_variable_for_type_inference(dtype='int32') - match_distance = helper.create_variable_for_type_inference( - dtype=dist_matrix.dtype) - helper.append_op( - type='bipartite_match', - inputs={'DistMat': dist_matrix}, - attrs={ - 'match_type': match_type, - 'dist_threshold': dist_threshold, - }, - outputs={ - 'ColToRowMatchIndices': match_indices, - 'ColToRowMatchDist': match_distance - }) - return match_indices, match_distance - - @paddle.jit.not_to_static def box_coder(prior_box, prior_box_var, @@ -1274,74 +692,74 @@ def box_coder(prior_box, r""" **Box Coder Layer** Encode/Decode the target bounding box with the priorbox information. - + The Encoding schema described below: .. math:: ox = (tx - px) / pw / pxv oy = (ty - py) / ph / pyv - ow = \log(\abs(tw / pw)) / pwv - oh = \log(\abs(th / ph)) / phv + ow = \log(\abs(tw / pw)) / pwv + oh = \log(\abs(th / ph)) / phv The Decoding schema described below: - + .. math:: - + ox = (pw * pxv * tx * + px) - tw / 2 oy = (ph * pyv * ty * + py) - th / 2 ow = \exp(pwv * tw) * pw + tw / 2 - oh = \exp(phv * th) * ph + th / 2 - where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, - width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote - the priorbox's (anchor) center coordinates, width and height. 
`pxv`, - `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, - `ow`, `oh` denote the encoded/decoded coordinates, width and height. - During Box Decoding, two modes for broadcast are supported. Say target - box has shape [N, M, 4], and the shape of prior box can be [N, 4] or - [M, 4]. Then prior box will broadcast to target box along the - assigned axis. + oh = \exp(phv * th) * ph + th / 2 + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. `pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. Args: - prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape + prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape [M, 4] holds M boxes and data type is float32 or float64. Each box - is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the + is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, if the input is image feature - map, they are close to the origin of the coordinate system. - [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var(List|Tensor|None): prior_box_var supports three types - of input. One is Tensor with shape [M, 4] which holds M group and - data type is float32 or float64. The second is list consist of - 4 elements shared by all boxes and data type is float32 or float64. - Other is None and not involved in calculation. - target_box(Tensor): This input can be a 2-D LoDTensor with shape - [N, 4] when code_type is 'encode_center_size'. This input also can - be a 3-D Tensor with shape [N, M, 4] when code_type is - 'decode_center_size'. Each box is represented as - [xmin, ymin, xmax, ymax]. The data type is float32 or float64. + map, they are close to the origin of the coordinate system. + [xmax, ymax] is the right bottom coordinate of the anchor box. + prior_box_var(List|Tensor|None): prior_box_var supports three types + of input. One is Tensor with shape [M, 4] which holds M group and + data type is float32 or float64. The second is list consist of + 4 elements shared by all boxes and data type is float32 or float64. + Other is None and not involved in calculation. + target_box(Tensor): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. This input also can + be a 3-D Tensor with shape [N, M, 4] when code_type is + 'decode_center_size'. Each box is represented as + [xmin, ymin, xmax, ymax]. The data type is float32 or float64. code_type(str): The code type used with the target box. It can be - `encode_center_size` or `decode_center_size`. `encode_center_size` + `encode_center_size` or `decode_center_size`. `encode_center_size` by default. box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. 
- axis(int): Which axis in PriorBox to broadcast for box decode, - for example, if axis is 0 and TargetBox has shape [N, M, 4] and + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape [N, M, 4] and PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] - for decoding. It is only valid when code type is - `decode_center_size`. Set 0 by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + for decoding. It is only valid when code type is + `decode_center_size`. Set 0 by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: Tensor: - output_box(Tensor): When code_type is 'encode_center_size', the - output tensor of box_coder_op with shape [N, M, 4] representing the - result of N target boxes encoded with M Prior boxes and variances. - When code_type is 'decode_center_size', N represents the batch size + output_box(Tensor): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape [N, M, 4] representing the + result of N target boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', N represents the batch size and M represents the number of decoded boxes. Examples: - + .. code-block:: python - + import paddle from paddlers.models.ppdet.modeling import ops paddle.enable_static() @@ -1375,14 +793,14 @@ def box_coder(prior_box, check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], 'box_coder') - if in_dygraph_mode(): + if in_dynamic_mode(): if isinstance(prior_box_var, Variable): - output_box = core.ops.box_coder( + output_box = C_ops.box_coder( prior_box, prior_box_var, target_box, "code_type", code_type, "box_normalized", box_normalized, "axis", axis) elif isinstance(prior_box_var, list): - output_box = core.ops.box_coder( + output_box = C_ops.box_coder( prior_box, None, target_box, "code_type", code_type, "box_normalized", box_normalized, "axis", axis, "variance", prior_box_var) @@ -1434,16 +852,16 @@ def generate_proposals(scores, """ **Generate proposal Faster-RCNN** This operation proposes RoIs according to each box with their - probability to be a foreground object and + probability to be a foreground object and the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals could be used to train detection net. For generating proposals, this operation performs following steps: 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) - 2. Calculate box locations as proposals candidates. + 2. Calculate box locations as proposals candidates. 3. Clip boxes to image - 4. Remove predicted boxes with small area. + 4. Remove predicted boxes with small area. 5. Apply NMS to get final proposals as output. Args: scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents @@ -1454,7 +872,7 @@ def generate_proposals(scores, represents the difference between predicted box location and anchor location. The data type must be float32. im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the - origin image size or input size. The data type can be float32 or + origin image size or input size. The data type can be float32 or float64. anchors(Tensor): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. 
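# --- Illustrative sketch (not part of the patch) ---------------------------
# box_coder above encodes a target box against a prior box in center-size
# form: offsets for the center are divided by the prior size and variance,
# and sizes are encoded as a variance-scaled log ratio. A minimal NumPy
# encoder for one (prior, target) pair, assuming all variances equal 1:
import numpy as np

def encode_center_size(prior, target):
    """prior/target: [xmin, ymin, xmax, ymax]; returns [ox, oy, ow, oh]."""
    px, py = (prior[0] + prior[2]) / 2, (prior[1] + prior[3]) / 2
    pw, ph = prior[2] - prior[0], prior[3] - prior[1]
    tx, ty = (target[0] + target[2]) / 2, (target[1] + target[3]) / 2
    tw, th = target[2] - target[0], target[3] - target[1]
    return np.array([(tx - px) / pw, (ty - py) / ph,
                     np.log(tw / pw), np.log(th / ph)])

print(encode_center_size(np.array([0., 0., 10., 10.]),
                         np.array([0., 0., 10., 10.])))
# -> [0. 0. 0. 0.]: a target identical to its prior encodes to zeros.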
H and W are height and width of the feature map, @@ -1472,13 +890,13 @@ def generate_proposals(scores, width < min_size. The data type must be float32. `0.1` by default. eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, `adaptive_threshold = adaptive_threshold * eta` in each iteration. - return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's + return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents - the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. - 'False' by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. + 'False' by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: tuple: @@ -1488,7 +906,7 @@ def generate_proposals(scores, Examples: .. code-block:: python - + import paddle from paddlers.models.ppdet.modeling import ops paddle.enable_static() @@ -1500,13 +918,15 @@ def generate_proposals(scores, rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, im_shape, anchors, variances) """ - if in_dygraph_mode(): + if in_dynamic_mode(): assert return_rois_num, "return_rois_num should be True in dygraph mode." attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, 'pixel_offset', pixel_offset) - rpn_rois, rpn_roi_probs, rpn_rois_num = core.ops.generate_proposals_v2( + rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2( scores, bbox_deltas, im_shape, anchors, variances, *attrs) + if not return_rois_num: + rpn_rois_num = None return rpn_rois, rpn_roi_probs, rpn_rois_num else: @@ -1557,6 +977,8 @@ def generate_proposals(scores, outputs=outputs) rpn_rois.stop_gradient = True rpn_roi_probs.stop_gradient = True + if not return_rois_num: + rpn_rois_num = None return rpn_rois, rpn_roi_probs, rpn_rois_num diff --git a/paddlers/models/ppdet/modeling/post_process.py b/paddlers/models/ppdet/modeling/post_process.py index b9e556e..61f6b03 100644 --- a/paddlers/models/ppdet/modeling/post_process.py +++ b/paddlers/models/ppdet/modeling/post_process.py @@ -17,7 +17,7 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F from paddlers.models.ppdet.core.workspace import register -from paddlers.models.ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly +from paddlers.models.ppdet.modeling.bbox_utils import nonempty_bbox from paddlers.models.ppdet.modeling.layers import TTFBox from .transformers import bbox_cxcywh_to_xyxy try: @@ -27,23 +27,30 @@ except Exception: __all__ = [ 'BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess', - 'S2ANetBBoxPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess', - 'DETRBBoxPostProcess', 'SparsePostProcess' + 'JDEBBoxPostProcess', 'CenterNetPostProcess', 'DETRBBoxPostProcess', + 'SparsePostProcess' ] @register -class BBoxPostProcess(nn.Layer): - __shared__ = ['num_classes'] +class BBoxPostProcess(object): + __shared__ = ['num_classes', 'export_onnx', 'export_eb'] __inject__ = ['decode', 'nms'] - def __init__(self, num_classes=80, decode=None, nms=None): + def __init__(self, + 
num_classes=80, + decode=None, + nms=None, + export_onnx=False, + export_eb=False): super(BBoxPostProcess, self).__init__() self.num_classes = num_classes self.decode = decode self.nms = nms + self.export_onnx = export_onnx + self.export_eb = export_eb - def forward(self, head_out, rois, im_shape, scale_factor): + def __call__(self, head_out, rois, im_shape, scale_factor): """ Decode the bbox and do NMS if needed. @@ -52,6 +59,7 @@ class BBoxPostProcess(nn.Layer): rois (tuple): roi and rois_num of rpn_head output. im_shape (Tensor): The shape of the input image. scale_factor (Tensor): The scale factor of the input image. + export_onnx (bool): whether export model to onnx Returns: bbox_pred (Tensor): The output prediction with shape [N, 6], including labels, scores and bboxes. The size of bboxes are corresponding @@ -62,15 +70,26 @@ class BBoxPostProcess(nn.Layer): if self.nms is not None: bboxes, score = self.decode(head_out, rois, im_shape, scale_factor) bbox_pred, bbox_num, _ = self.nms(bboxes, score, self.num_classes) + else: bbox_pred, bbox_num = self.decode(head_out, rois, im_shape, scale_factor) + + if self.export_onnx: + # add fake box after postprocess when exporting onnx + fake_bboxes = paddle.to_tensor( + np.array( + [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) + + bbox_pred = paddle.concat([bbox_pred, fake_bboxes]) + bbox_num = bbox_num + 1 + return bbox_pred, bbox_num def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): """ - Rescale, clip and filter the bbox from the output of NMS to - get final prediction. + Rescale, clip and filter the bbox from the output of NMS to + get final prediction. Notes: Currently only support bs = 1. @@ -86,46 +105,59 @@ class BBoxPostProcess(nn.Layer): pred_result (Tensor): The final prediction results with shape [N, 6] including labels, scores and bboxes. """ - - bboxes_list = [] - bbox_num_list = [] - id_start = 0 - fake_bboxes = paddle.to_tensor( - np.array( - [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) - fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) - - # add fake bbox when output is empty for each batch - for i in range(bbox_num.shape[0]): - if bbox_num[i] == 0: - bboxes_i = fake_bboxes - bbox_num_i = fake_bbox_num - id_start += 1 - else: - bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] - bbox_num_i = bbox_num[i] - id_start += bbox_num[i] - bboxes_list.append(bboxes_i) - bbox_num_list.append(bbox_num_i) - bboxes = paddle.concat(bboxes_list) - bbox_num = paddle.concat(bbox_num_list) + if self.export_eb: + # enable rcnn models for edgeboard hw to skip the following postprocess. 
+ return bboxes, bboxes, bbox_num + + if not self.export_onnx: + bboxes_list = [] + bbox_num_list = [] + id_start = 0 + fake_bboxes = paddle.to_tensor( + np.array( + [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) + fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) + + # add fake bbox when output is empty for each batch + for i in range(bbox_num.shape[0]): + if bbox_num[i] == 0: + bboxes_i = fake_bboxes + bbox_num_i = fake_bbox_num + else: + bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] + bbox_num_i = bbox_num[i] + id_start += bbox_num[i] + bboxes_list.append(bboxes_i) + bbox_num_list.append(bbox_num_i) + bboxes = paddle.concat(bboxes_list) + bbox_num = paddle.concat(bbox_num_list) origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - origin_shape_list = [] - scale_factor_list = [] - # scale_factor: scale_y, scale_x - for i in range(bbox_num.shape[0]): - expand_shape = paddle.expand(origin_shape[i:i + 1, :], - [bbox_num[i], 2]) - scale_y, scale_x = scale_factor[i][0], scale_factor[i][1] - scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) - expand_scale = paddle.expand(scale, [bbox_num[i], 4]) - origin_shape_list.append(expand_shape) - scale_factor_list.append(expand_scale) - - self.origin_shape_list = paddle.concat(origin_shape_list) - scale_factor_list = paddle.concat(scale_factor_list) + if not self.export_onnx: + origin_shape_list = [] + scale_factor_list = [] + # scale_factor: scale_y, scale_x + for i in range(bbox_num.shape[0]): + expand_shape = paddle.expand(origin_shape[i:i + 1, :], + [bbox_num[i], 2]) + scale_y, scale_x = scale_factor[i][0], scale_factor[i][1] + scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) + expand_scale = paddle.expand(scale, [bbox_num[i], 4]) + origin_shape_list.append(expand_shape) + scale_factor_list.append(expand_scale) + + self.origin_shape_list = paddle.concat(origin_shape_list) + scale_factor_list = paddle.concat(scale_factor_list) + + else: + # simplify the computation for bs=1 when exporting onnx + scale_y, scale_x = scale_factor[0][0], scale_factor[0][1] + scale = paddle.concat( + [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0) + self.origin_shape_list = paddle.expand(origin_shape, + [bbox_num[0], 2]) + scale_factor_list = paddle.expand(scale, [bbox_num[0], 4]) # bboxes: [N, 6], label, score, bbox pred_label = bboxes[:, 0:1] @@ -148,7 +180,7 @@ class BBoxPostProcess(nn.Layer): pred_label = paddle.where(keep_mask, pred_label, paddle.ones_like(pred_label) * -1) pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1) - return pred_result + return bboxes, pred_result, bbox_num def get_origin_shape(self, ): return self.origin_shape_list @@ -156,6 +188,7 @@ class BBoxPostProcess(nn.Layer): @register class MaskPostProcess(object): + __shared__ = ['export_onnx', 'assign_on_cpu'] """ refer to: https://github.com/facebookresearch/detectron2/layers/mask_ops.py @@ -163,27 +196,36 @@ class MaskPostProcess(object): Get Mask output according to the output from model """ - def __init__(self, binary_thresh=0.5): + def __init__(self, + binary_thresh=0.5, + export_onnx=False, + assign_on_cpu=False): super(MaskPostProcess, self).__init__() self.binary_thresh = binary_thresh + self.export_onnx = export_onnx + self.assign_on_cpu = assign_on_cpu def paste_mask(self, masks, boxes, im_h, im_w): """ Paste the mask prediction to the original image. 
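# --- Illustrative sketch (not part of the patch) ---------------------------
# get_pred above maps boxes from network-input coordinates back to the
# original image: it recovers the origin shape as
# floor(im_shape / scale_factor + 0.5), divides each [x1, y1, x2, y2] by
# the per-box scale [scale_x, scale_y, scale_x, scale_y], and clips to the
# image. A one-box NumPy version of that rescale-and-clip step:
import numpy as np

def rescale_to_origin(bbox, scale_y, scale_x, origin_h, origin_w):
    """bbox: [x1, y1, x2, y2] in network-input scale."""
    scale = np.array([scale_x, scale_y, scale_x, scale_y])
    bbox = bbox / scale                       # undo the resize
    lo = np.zeros(4)
    hi = np.array([origin_w, origin_h, origin_w, origin_h])
    return np.clip(bbox, lo, hi)              # keep boxes inside the image

print(rescale_to_origin(np.array([100., 100., 300., 300.]),
                        scale_y=2.0, scale_x=2.0, origin_h=120, origin_w=120))
# -> [ 50.  50. 120. 120.]: rescaled by 1/2, then clipped to a 120x120 image.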
""" - + x0_int, y0_int = 0, 0 + x1_int, y1_int = im_w, im_h x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1) - masks = paddle.unsqueeze(masks, [0, 1]) - img_y = paddle.arange(0, im_h, dtype='float32') + 0.5 - img_x = paddle.arange(0, im_w, dtype='float32') + 0.5 + N = masks.shape[0] + img_y = paddle.arange(y0_int, y1_int) + 0.5 + img_x = paddle.arange(x0_int, x1_int) + 0.5 + img_y = (img_y - y0) / (y1 - y0) * 2 - 1 img_x = (img_x - x0) / (x1 - x0) * 2 - 1 - img_x = paddle.unsqueeze(img_x, [1]) - img_y = paddle.unsqueeze(img_y, [2]) - N = boxes.shape[0] - - gx = paddle.expand(img_x, [N, img_y.shape[1], img_x.shape[2]]) - gy = paddle.expand(img_y, [N, img_y.shape[1], img_x.shape[2]]) + # img_x, img_y have shapes (N, w), (N, h) + + if self.assign_on_cpu: + paddle.set_device('cpu') + gx = img_x[:, None, :].expand( + [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) + gy = img_y[:, :, None].expand( + [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) grid = paddle.stack([gx, gy], axis=3) img_masks = F.grid_sample(masks, grid, align_corners=False) return img_masks[:, 0] @@ -206,22 +248,38 @@ class MaskPostProcess(object): """ num_mask = mask_out.shape[0] origin_shape = paddle.cast(origin_shape, 'int32') - # TODO: support bs > 1 and mask output dtype is bool - pred_result = paddle.zeros( - [num_mask, origin_shape[0][0], origin_shape[0][1]], dtype='int32') - if (len(bbox_num) == 1 and bbox_num[0] == 1) and bboxes[0][0] == -1: - return pred_result - - # TODO: optimize chunk paste - pred_result = [] - for i in range(bboxes.shape[0]): - im_h, im_w = origin_shape[i][0], origin_shape[i][1] - pred_mask = self.paste_mask(mask_out[i], bboxes[i:i + 1, 2:], im_h, - im_w) - pred_mask = pred_mask >= self.binary_thresh - pred_mask = paddle.cast(pred_mask, 'int32') - pred_result.append(pred_mask) - pred_result = paddle.concat(pred_result) + device = paddle.device.get_device() + + if self.export_onnx: + h, w = origin_shape[0][0], origin_shape[0][1] + mask_onnx = self.paste_mask(mask_out[:, None, :, :], bboxes[:, 2:], + h, w) + mask_onnx = mask_onnx >= self.binary_thresh + pred_result = paddle.cast(mask_onnx, 'int32') + + else: + max_h = paddle.max(origin_shape[:, 0]) + max_w = paddle.max(origin_shape[:, 1]) + pred_result = paddle.zeros( + [num_mask, max_h, max_w], dtype='int32') - 1 + + id_start = 0 + for i in range(paddle.shape(bbox_num)[0]): + bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] + mask_out_i = mask_out[id_start:id_start + bbox_num[i], :, :] + im_h = origin_shape[i, 0] + im_w = origin_shape[i, 1] + bbox_num_i = bbox_num[id_start] + pred_mask = self.paste_mask(mask_out_i[:, None, :, :], + bboxes_i[:, 2:], im_h, im_w) + pred_mask = paddle.cast(pred_mask >= self.binary_thresh, + 'int32') + pred_result[id_start:id_start + bbox_num[i], :im_h, : + im_w] = pred_mask + id_start += bbox_num[i] + if self.assign_on_cpu: + paddle.set_device(device) + return pred_result @@ -245,109 +303,6 @@ class FCOSPostProcess(object): return bbox_pred, bbox_num -@register -class S2ANetBBoxPostProcess(nn.Layer): - __shared__ = ['num_classes'] - __inject__ = ['nms'] - - def __init__(self, num_classes=15, nms_pre=2000, min_bbox_size=0, nms=None): - super(S2ANetBBoxPostProcess, self).__init__() - self.num_classes = num_classes - self.nms_pre = paddle.to_tensor(nms_pre) - self.min_bbox_size = min_bbox_size - self.nms = nms - self.origin_shape_list = [] - self.fake_pred_cls_score_bbox = paddle.to_tensor( - np.array( - [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], - dtype='float32')) - self.fake_bbox_num = 
paddle.to_tensor(np.array([1], dtype='int32')) - - def forward(self, pred_scores, pred_bboxes): - """ - pred_scores : [N, M] score - pred_bboxes : [N, 5] xc, yc, w, h, a - im_shape : [N, 2] im_shape - scale_factor : [N, 2] scale_factor - """ - pred_ploys0 = rbox2poly(pred_bboxes) - pred_ploys = paddle.unsqueeze(pred_ploys0, axis=0) - - # pred_scores [NA, 16] --> [16, NA] - pred_scores0 = paddle.transpose(pred_scores, [1, 0]) - pred_scores = paddle.unsqueeze(pred_scores0, axis=0) - - pred_cls_score_bbox, bbox_num, _ = self.nms(pred_ploys, pred_scores, - self.num_classes) - # Prevent empty bbox_pred from decode or NMS. - # Bboxes and score before NMS may be empty due to the score threshold. - if pred_cls_score_bbox.shape[0] <= 0 or pred_cls_score_bbox.shape[ - 1] <= 1: - pred_cls_score_bbox = self.fake_pred_cls_score_bbox - bbox_num = self.fake_bbox_num - - pred_cls_score_bbox = paddle.reshape(pred_cls_score_bbox, [-1, 10]) - return pred_cls_score_bbox, bbox_num - - def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): - """ - Rescale, clip and filter the bbox from the output of NMS to - get final prediction. - Args: - bboxes(Tensor): bboxes [N, 10] - bbox_num(Tensor): bbox_num - im_shape(Tensor): [1 2] - scale_factor(Tensor): [1 2] - Returns: - bbox_pred(Tensor): The output is the prediction with shape [N, 8] - including labels, scores and bboxes. The size of - bboxes are corresponding to the original image. - """ - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - - origin_shape_list = [] - scale_factor_list = [] - # scale_factor: scale_y, scale_x - for i in range(bbox_num.shape[0]): - expand_shape = paddle.expand(origin_shape[i:i + 1, :], - [bbox_num[i], 2]) - scale_y, scale_x = scale_factor[i][0], scale_factor[i][1] - scale = paddle.concat([ - scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, - scale_y - ]) - expand_scale = paddle.expand(scale, [bbox_num[i], 8]) - origin_shape_list.append(expand_shape) - scale_factor_list.append(expand_scale) - - origin_shape_list = paddle.concat(origin_shape_list) - scale_factor_list = paddle.concat(scale_factor_list) - - # bboxes: [N, 10], label, score, bbox - pred_label_score = bboxes[:, 0:2] - pred_bbox = bboxes[:, 2:] - - # rescale bbox to original image - pred_bbox = pred_bbox.reshape([-1, 8]) - scaled_bbox = pred_bbox / scale_factor_list - origin_h = origin_shape_list[:, 0] - origin_w = origin_shape_list[:, 1] - - bboxes = scaled_bbox - zeros = paddle.zeros_like(origin_h) - x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros) - y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros) - x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros) - y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros) - x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros) - y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros) - x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros) - y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros) - pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) - pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1) - return pred_result - - @register class JDEBBoxPostProcess(nn.Layer): __shared__ = ['num_classes'] @@ -378,18 +333,18 @@ class JDEBBoxPostProcess(nn.Layer): def forward(self, head_out, anchors): """ - Decode the bbox and do NMS for JDE model. + Decode the bbox and do NMS for JDE model. 
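# --- Illustrative sketch (not part of the patch) ---------------------------
# The rewritten paste_mask above resamples each small mask into its box on
# the full image: pixel centers (i + 0.5) are mapped into the box-relative
# range [-1, 1] that grid_sample expects. The coordinate mapping itself,
# in NumPy, for one axis of one box spanning [x0, x1):
import numpy as np

def grid_coords(x0, x1, im_w):
    """Normalized sample positions for every image column, for one box."""
    img_x = np.arange(0, im_w) + 0.5          # pixel centers
    return (img_x - x0) / (x1 - x0) * 2 - 1   # -1 at x0, +1 at x1

print(grid_coords(2.0, 6.0, 8))
# -> [-1.75 -1.25 -0.75 -0.25  0.25  0.75  1.25  1.75]: the four columns
# whose centers fall inside the box land in [-1, 1]; columns outside
# sample zeros under grid_sample's default padding.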
Args: head_out (list): Bbox_pred and cls_prob of bbox_head output. anchors (list): Anchors of JDE model. Returns: - boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. + boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. bbox_pred (Tensor): The output is the prediction with shape [N, 6] including labels, scores and bboxes. bbox_num (Tensor): The number of prediction of each batch with shape [N]. - nms_keep_idx (Tensor): The index of kept bboxes after NMS. + nms_keep_idx (Tensor): The index of kept bboxes after NMS. """ boxes_idx, yolo_boxes_scores = self.decode(head_out, anchors) @@ -484,7 +439,7 @@ class CenterNetPostProcess(TTFBox): x2 = xs + wh[:, 0:1] / 2 y2 = ys + wh[:, 1:2] / 2 - n, c, feat_h, feat_w = hm.shape[:] + n, c, feat_h, feat_w = paddle.shape(hm) padw = (feat_w * self.down_ratio - im_shape[0, 1]) / 2 padh = (feat_h * self.down_ratio - im_shape[0, 0]) / 2 x1 = x1 * self.down_ratio @@ -505,11 +460,10 @@ class CenterNetPostProcess(TTFBox): boxes_shape = bboxes.shape[:] scale_expand = paddle.expand(scale_expand, shape=boxes_shape) bboxes = paddle.divide(bboxes, scale_expand) + results = paddle.concat([clses, scores, bboxes], axis=1) if self.for_mot: - results = paddle.concat([bboxes, scores, clses], axis=1) return results, inds, topk_clses else: - results = paddle.concat([clses, scores, bboxes], axis=1) return results, paddle.shape(results)[0:1], topk_clses @@ -672,8 +626,23 @@ class SparsePostProcess(object): return bbox_pred, bbox_num -def nms(dets, thresh): - """Apply classic DPM-style greedy NMS.""" +def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): + final_boxes = [] + for c in range(num_classes): + idxs = bboxs[:, 0] == c + if np.count_nonzero(idxs) == 0: continue + r = nms(bboxs[idxs, 1:], match_threshold, match_metric) + final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) + return final_boxes + + +def nms(dets, match_threshold=0.6, match_metric='iou'): + """ Apply NMS to avoid detecting too many overlapping bounding boxes. + Args: + dets: shape [N, 5], [score, x1, y1, x2, y2] + match_metric: 'iou' or 'ios' + match_threshold: overlap thresh for match metric. 
+ """ if dets.shape[0] == 0: return dets[[], :] scores = dets[:, 0] @@ -681,25 +650,12 @@ def nms(dets, thresh): y1 = dets[:, 2] x2 = dets[:, 3] y2 = dets[:, 4] - areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] ndets = dets.shape[0] suppressed = np.zeros((ndets), dtype=np.int) - # nominal indices - # _i, _j - # sorted indices - # i, j - # temp variables for box i's (the box currently under consideration) - # ix1, iy1, ix2, iy2, iarea - - # variables for computing overlap with box j (lower scoring box) - # xx1, yy1, xx2, yy2 - # w, h - # inter, ovr - for _i in range(ndets): i = order[_i] if suppressed[i] == 1: @@ -720,8 +676,15 @@ def nms(dets, thresh): w = max(0.0, xx2 - xx1 + 1) h = max(0.0, yy2 - yy1 + 1) inter = w * h - ovr = inter / (iarea + areas[j] - inter) - if ovr >= thresh: + if match_metric == 'iou': + union = iarea + areas[j] - inter + match_value = inter / union + elif match_metric == 'ios': + smaller = min(iarea, areas[j]) + match_value = inter / smaller + else: + raise ValueError() + if match_value >= match_threshold: suppressed[j] = 1 keep = np.where(suppressed == 0)[0] dets = dets[keep, :] diff --git a/paddlers/models/ppdet/modeling/proposal_generator/anchor_generator.py b/paddlers/models/ppdet/modeling/proposal_generator/anchor_generator.py index 40538a6..ae5b074 100644 --- a/paddlers/models/ppdet/modeling/proposal_generator/anchor_generator.py +++ b/paddlers/models/ppdet/modeling/proposal_generator/anchor_generator.py @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -# The code is based on +# The code is based on # https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/anchor_generator.py import math import paddle import paddle.nn as nn +import numpy as np from paddlers.models.ppdet.core.workspace import register +__all__ = ['AnchorGenerator', 'RetinaAnchorGenerator', 'S2ANetAnchorGenerator'] + @register class AnchorGenerator(nn.Layer): @@ -29,18 +32,18 @@ class AnchorGenerator(nn.Layer): Generate anchors according to the feature maps Args: - anchor_sizes (list[float] | list[list[float]]): The anchor sizes at - each feature point. list[float] means all feature levels share the - same sizes. list[list[float]] means the anchor sizes for + anchor_sizes (list[float] | list[list[float]]): The anchor sizes at + each feature point. list[float] means all feature levels share the + same sizes. list[list[float]] means the anchor sizes for each level. The sizes stand for the scale of input size. aspect_ratios (list[float] | list[list[float]]): The aspect ratios at each feature point. list[float] means all feature levels share the same ratios. list[list[float]] means the aspect ratios for each level. - strides (list[float]): The strides of feature maps which generate + strides (list[float]): The strides of feature maps which generate anchors offset (float): The offset of the coordinate of anchors, default 0. - + """ def __init__(self, @@ -129,3 +132,135 @@ class AnchorGenerator(nn.Layer): For FPN models, `num_anchors` on every feature map is the same. 
""" return len(self.cell_anchors[0]) + + +@register +class RetinaAnchorGenerator(AnchorGenerator): + def __init__(self, + octave_base_scale=4, + scales_per_octave=3, + aspect_ratios=[0.5, 1.0, 2.0], + strides=[8.0, 16.0, 32.0, 64.0, 128.0], + variance=[1.0, 1.0, 1.0, 1.0], + offset=0.0): + anchor_sizes = [] + for s in strides: + anchor_sizes.append([ + s * octave_base_scale * 2**(i/scales_per_octave) \ + for i in range(scales_per_octave)]) + super(RetinaAnchorGenerator, self).__init__( + anchor_sizes=anchor_sizes, + aspect_ratios=aspect_ratios, + strides=strides, + variance=variance, + offset=offset) + + +@register +class S2ANetAnchorGenerator(nn.Layer): + """ + AnchorGenerator by paddle + """ + + def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): + super(S2ANetAnchorGenerator, self).__init__() + self.base_size = base_size + self.scales = paddle.to_tensor(scales) + self.ratios = paddle.to_tensor(ratios) + self.scale_major = scale_major + self.ctr = ctr + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + return self.base_anchors.shape[0] + + def gen_base_anchors(self): + w = self.base_size + h = self.base_size + if self.ctr is None: + x_ctr = 0.5 * (w - 1) + y_ctr = 0.5 * (h - 1) + else: + x_ctr, y_ctr = self.ctr + + h_ratios = paddle.sqrt(self.ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:] * self.scales[:]).reshape([-1]) + hs = (h * h_ratios[:] * self.scales[:]).reshape([-1]) + else: + ws = (w * self.scales[:] * w_ratios[:]).reshape([-1]) + hs = (h * self.scales[:] * h_ratios[:]).reshape([-1]) + + base_anchors = paddle.stack( + [ + x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) + ], + axis=-1) + base_anchors = paddle.round(base_anchors) + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + yy, xx = paddle.meshgrid(y, x) + yy = yy.reshape([-1]) + xx = xx.reshape([-1]) + if row_major: + return xx, yy + else: + return yy, xx + + def forward(self, featmap_size, stride=16): + # featmap_size*stride project it to original area + + feat_h = featmap_size[0] + feat_w = featmap_size[1] + shift_x = paddle.arange(0, feat_w, 1, 'int32') * stride + shift_y = paddle.arange(0, feat_h, 1, 'int32') * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = paddle.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1) + + all_anchors = self.base_anchors[:, :] + shifts[:, :] + all_anchors = all_anchors.cast(paddle.float32).reshape( + [feat_h * feat_w, 4]) + all_anchors = self.rect2rbox(all_anchors) + return all_anchors + + def valid_flags(self, featmap_size, valid_size): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = paddle.zeros([feat_w], dtype='int32') + valid_y = paddle.zeros([feat_h], dtype='int32') + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = paddle.reshape(valid, [-1, 1]) + valid = paddle.expand(valid, [-1, self.num_base_anchors]).reshape([-1]) + return valid + + def rect2rbox(self, bboxes): + """ + :param bboxes: shape (L, 4) (xmin, ymin, xmax, ymax) + :return: dbboxes: shape (L, 5) (x_ctr, y_ctr, w, h, angle) + """ + x1, y1, x2, y2 = paddle.split(bboxes, 4, axis=-1) + + x_ctr = (x1 + x2) / 2.0 + y_ctr = (y1 + y2) / 2.0 + edges1 = paddle.abs(x2 - x1) + edges2 = paddle.abs(y2 - y1) + + rbox_w = paddle.maximum(edges1, edges2) + rbox_h = 
paddle.minimum(edges1, edges2) + + # set angle + inds = edges1 < edges2 + inds = paddle.cast(inds, paddle.float32) + rboxes_angle = inds * np.pi / 2.0 + + rboxes = paddle.concat( + (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=-1) + return rboxes diff --git a/paddlers/models/ppdet/modeling/proposal_generator/proposal_generator.py b/paddlers/models/ppdet/modeling/proposal_generator/proposal_generator.py index 2af84bc..1fc9544 100644 --- a/paddlers/models/ppdet/modeling/proposal_generator/proposal_generator.py +++ b/paddlers/models/ppdet/modeling/proposal_generator/proposal_generator.py @@ -24,7 +24,7 @@ class ProposalGenerator(object): """ Proposal generation module - For more details, please refer to the document of generate_proposals + For more details, please refer to the document of generate_proposals in ppdet/modeing/ops.py Args: @@ -38,8 +38,8 @@ class ProposalGenerator(object): eta (float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, `adaptive_threshold = adaptive_threshold * eta` in each iteration. default 1. - topk_after_collect (bool): whether to adopt topk after batch - collection. If topk_after_collect is true, box filter will not be + topk_after_collect (bool): whether to adopt topk after batch + collection. If topk_after_collect is true, box filter will not be used after NMS at each image in proposal generation. default false """ @@ -62,16 +62,31 @@ class ProposalGenerator(object): top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n variances = paddle.ones_like(anchors) - rpn_rois, rpn_rois_prob, rpn_rois_num = ops.generate_proposals( - scores, - bbox_deltas, - im_shape, - anchors, - variances, - pre_nms_top_n=self.pre_nms_top_n, - post_nms_top_n=top_n, - nms_thresh=self.nms_thresh, - min_size=self.min_size, - eta=self.eta, - return_rois_num=True) + if hasattr(paddle.vision.ops, "generate_proposals"): + rpn_rois, rpn_rois_prob, rpn_rois_num = paddle.vision.ops.generate_proposals( + scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=self.pre_nms_top_n, + post_nms_top_n=top_n, + nms_thresh=self.nms_thresh, + min_size=self.min_size, + eta=self.eta, + return_rois_num=True) + else: + rpn_rois, rpn_rois_prob, rpn_rois_num = ops.generate_proposals( + scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=self.pre_nms_top_n, + post_nms_top_n=top_n, + nms_thresh=self.nms_thresh, + min_size=self.min_size, + eta=self.eta, + return_rois_num=True) + return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n diff --git a/paddlers/models/ppdet/modeling/proposal_generator/rpn_head.py b/paddlers/models/ppdet/modeling/proposal_generator/rpn_head.py index e0605cc..f301da9 100644 --- a/paddlers/models/ppdet/modeling/proposal_generator/rpn_head.py +++ b/paddlers/models/ppdet/modeling/proposal_generator/rpn_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import paddle @@ -21,6 +21,7 @@ from paddlers.models.ppdet.core.workspace import register from .anchor_generator import AnchorGenerator from .target_layer import RPNTargetAssign from .proposal_generator import ProposalGenerator +from ..cls_utils import _get_class_default_kwargs class RPNFeat(nn.Layer): @@ -66,18 +67,24 @@ class RPNHead(nn.Layer): in_channel (int): channel of input feature maps which can be derived by from_config """ + __shared__ = ['export_onnx'] + __inject__ = ['loss_rpn_bbox'] def __init__(self, - anchor_generator=AnchorGenerator().__dict__, - rpn_target_assign=RPNTargetAssign().__dict__, - train_proposal=ProposalGenerator(12000, 2000).__dict__, - test_proposal=ProposalGenerator().__dict__, - in_channel=1024): + anchor_generator=_get_class_default_kwargs(AnchorGenerator), + rpn_target_assign=_get_class_default_kwargs(RPNTargetAssign), + train_proposal=_get_class_default_kwargs(ProposalGenerator, + 12000, 2000), + test_proposal=_get_class_default_kwargs(ProposalGenerator), + in_channel=1024, + export_onnx=False, + loss_rpn_bbox=None): super(RPNHead, self).__init__() self.anchor_generator = anchor_generator self.rpn_target_assign = rpn_target_assign self.train_proposal = train_proposal self.test_proposal = test_proposal + self.export_onnx = export_onnx if isinstance(anchor_generator, dict): self.anchor_generator = AnchorGenerator(**anchor_generator) if isinstance(rpn_target_assign, dict): @@ -86,6 +93,7 @@ class RPNHead(nn.Layer): self.train_proposal = ProposalGenerator(**train_proposal) if isinstance(test_proposal, dict): self.test_proposal = ProposalGenerator(**test_proposal) + self.loss_rpn_bbox = loss_rpn_bbox num_anchors = self.anchor_generator.num_anchors self.rpn_feat = RPNFeat(in_channel, in_channel) @@ -149,49 +157,90 @@ class RPNHead(nn.Layer): # Collect multi-level proposals for each batch # Get 'topk' of them as final output - bs_rois_collect = [] - bs_rois_num_collect = [] - batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) - # Generate proposals for each level and each batch. - # Discard batch-computing to avoid sorting bbox cross different batches. 
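# --- Illustrative sketch (not part of the patch) ---------------------------
# RPNHead above replaces the old `AnchorGenerator().__dict__` defaults
# (which eagerly instantiated layers at import time) with
# _get_class_default_kwargs(AnchorGenerator). That helper comes from
# ..cls_utils and is not shown in this diff; a plausible implementation
# collects the __init__ defaults without constructing the object:
import inspect

def get_class_default_kwargs(cls, *args):
    """Hypothetical stand-in: bind positional args, keep remaining defaults."""
    sig = inspect.signature(cls.__init__)
    params = list(sig.parameters.values())[1:]          # drop `self`
    kwargs = {p.name: p.default for p in params
              if p.default is not inspect.Parameter.empty}
    # positional overrides, e.g. ProposalGenerator, 12000, 2000
    for name, value in zip([p.name for p in params], args):
        kwargs[name] = value
    return kwargs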
- for i in range(batch_size): - rpn_rois_list = [] - rpn_prob_list = [] - rpn_rois_num_list = [] + if self.export_onnx: + # bs = 1 when exporting onnx + onnx_rpn_rois_list = [] + onnx_rpn_prob_list = [] + onnx_rpn_rois_num_list = [] for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, anchors): - rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen( - scores=rpn_score[i:i + 1], - bbox_deltas=rpn_delta[i:i + 1], + onnx_rpn_rois, onnx_rpn_rois_prob, onnx_rpn_rois_num, onnx_post_nms_top_n = prop_gen( + scores=rpn_score[0:1], + bbox_deltas=rpn_delta[0:1], anchors=anchor, - im_shape=im_shape[i:i + 1]) - if rpn_rois.shape[0] > 0: + im_shape=im_shape[0:1]) + onnx_rpn_rois_list.append(onnx_rpn_rois) + onnx_rpn_prob_list.append(onnx_rpn_rois_prob) + onnx_rpn_rois_num_list.append(onnx_rpn_rois_num) + + onnx_rpn_rois = paddle.concat(onnx_rpn_rois_list) + onnx_rpn_prob = paddle.concat(onnx_rpn_prob_list).flatten() + + onnx_top_n = paddle.to_tensor(onnx_post_nms_top_n).cast('int32') + onnx_num_rois = paddle.shape(onnx_rpn_prob)[0].cast('int32') + k = paddle.minimum(onnx_top_n, onnx_num_rois) + onnx_topk_prob, onnx_topk_inds = paddle.topk(onnx_rpn_prob, k) + onnx_topk_rois = paddle.gather(onnx_rpn_rois, onnx_topk_inds) + # TODO(wangguanzhong): Now bs_rois_collect in export_onnx is moved outside conditional branch + # due to problems in dy2static of paddle. Will fix it when updating paddle framework. + # bs_rois_collect = [onnx_topk_rois] + # bs_rois_num_collect = paddle.shape(onnx_topk_rois)[0] + + else: + bs_rois_collect = [] + bs_rois_num_collect = [] + + batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) + + # Generate proposals for each level and each batch. + # Discard batch-computing to avoid sorting bbox cross different batches. 
+ for i in range(batch_size): + rpn_rois_list = [] + rpn_prob_list = [] + rpn_rois_num_list = [] + + for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, + anchors): + rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen( + scores=rpn_score[i:i + 1], + bbox_deltas=rpn_delta[i:i + 1], + anchors=anchor, + im_shape=im_shape[i:i + 1]) rpn_rois_list.append(rpn_rois) rpn_prob_list.append(rpn_rois_prob) rpn_rois_num_list.append(rpn_rois_num) - if len(scores) > 1: - rpn_rois = paddle.concat(rpn_rois_list) - rpn_prob = paddle.concat(rpn_prob_list).flatten() - - if rpn_prob.shape[0] > post_nms_top_n: - topk_prob, topk_inds = paddle.topk(rpn_prob, post_nms_top_n) - topk_rois = paddle.gather(rpn_rois, topk_inds) + if len(scores) > 1: + rpn_rois = paddle.concat(rpn_rois_list) + rpn_prob = paddle.concat(rpn_prob_list).flatten() + + num_rois = paddle.shape(rpn_prob)[0].cast('int32') + if num_rois > post_nms_top_n: + topk_prob, topk_inds = paddle.topk(rpn_prob, + post_nms_top_n) + topk_rois = paddle.gather(rpn_rois, topk_inds) + else: + topk_rois = rpn_rois + topk_prob = rpn_prob else: - topk_rois = rpn_rois - topk_prob = rpn_prob - else: - topk_rois = rpn_rois_list[0] - topk_prob = rpn_prob_list[0].flatten() + topk_rois = rpn_rois_list[0] + topk_prob = rpn_prob_list[0].flatten() - bs_rois_collect.append(topk_rois) - bs_rois_num_collect.append(paddle.shape(topk_rois)[0]) + bs_rois_collect.append(topk_rois) + bs_rois_num_collect.append(paddle.shape(topk_rois)[0]) - bs_rois_num_collect = paddle.concat(bs_rois_num_collect) + bs_rois_num_collect = paddle.concat(bs_rois_num_collect) - return bs_rois_collect, bs_rois_num_collect + if self.export_onnx: + output_rois = [onnx_topk_rois] + output_rois_num = paddle.shape(onnx_topk_rois)[0] + else: + output_rois = bs_rois_collect + output_rois_num = bs_rois_num_collect + + return output_rois, output_rois_num def get_loss(self, pred_scores, pred_deltas, anchors, inputs): """ @@ -252,7 +301,12 @@ class RPNHead(nn.Layer): loc_tgt = paddle.concat(loc_tgt) loc_tgt = paddle.gather(loc_tgt, pos_ind) loc_tgt.stop_gradient = True - loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum() + + if self.loss_rpn_bbox is None: + loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum() + else: + loss_rpn_reg = self.loss_rpn_bbox(loc_pred, loc_tgt).sum() + return { 'loss_rpn_cls': loss_rpn_cls / norm, 'loss_rpn_reg': loss_rpn_reg / norm diff --git a/paddlers/models/ppdet/modeling/proposal_generator/target.py b/paddlers/models/ppdet/modeling/proposal_generator/target.py index b92d0b3..82930be 100644 --- a/paddlers/models/ppdet/modeling/proposal_generator/target.py +++ b/paddlers/models/ppdet/modeling/proposal_generator/target.py @@ -39,7 +39,7 @@ def rpn_anchor_target(anchors, matches, match_labels = label_box( anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True, ignore_thresh, is_crowd_i, assign_on_cpu) - # Step2: sample anchor + # Step2: sample anchor fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im, rpn_fg_fraction, 0, use_random) # Fill with the ignore label (-1), then set positive and negative labels @@ -48,7 +48,7 @@ def rpn_anchor_target(anchors, labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds)) if fg_inds.shape[0] > 0: labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds)) - # Step3: make output + # Step3: make output if gt_bbox.shape[0] == 0: matched_gt_boxes = paddle.zeros([matches.shape[0], 4]) tgt_delta = paddle.zeros([matches.shape[0], 4]) @@ -74,9 +74,11 @@ def label_box(anchors, 
is_crowd=None, assign_on_cpu=False): if assign_on_cpu: + device = paddle.device.get_device() paddle.set_device("cpu") iou = bbox_overlaps(gt_boxes, anchors) - paddle.set_device("gpu") + paddle.set_device(device) + else: iou = bbox_overlaps(gt_boxes, anchors) n_gt = gt_boxes.shape[0] @@ -89,7 +91,7 @@ def label_box(anchors, default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64') default_match_labels = paddle.full((iou.shape[1], ), 0, dtype='int32') return default_matches, default_match_labels - # if ignore_thresh > 0, remove anchor if it is closed to + # if ignore_thresh > 0, remove anchor if it is closed to # one of the crowded ground-truth if n_gt_crowd > 0: N_a = anchors.shape[0] @@ -184,7 +186,8 @@ def generate_proposal_target(rpn_rois, use_random=True, is_cascade=False, cascade_iou=0.5, - assign_on_cpu=False): + assign_on_cpu=False, + add_gt_as_proposals=True): rois_with_gt = [] tgt_labels = [] @@ -202,7 +205,7 @@ def generate_proposal_target(rpn_rois, gt_class = paddle.squeeze(gt_classes[i], axis=-1) # Concat RoIs and gt boxes except cascade rcnn or none gt - if not is_cascade and gt_bbox.shape[0] > 0: + if add_gt_as_proposals and gt_bbox.shape[0] > 0: bbox = paddle.concat([rpn_roi, gt_bbox]) else: bbox = rpn_roi @@ -211,12 +214,12 @@ def generate_proposal_target(rpn_rois, matches, match_labels = label_box(bbox, gt_bbox, fg_thresh, bg_thresh, False, ignore_thresh, is_crowd_i, assign_on_cpu) - # Step2: sample bbox + # Step2: sample bbox sampled_inds, sampled_gt_classes = sample_bbox( matches, match_labels, gt_class, batch_size_per_im, fg_fraction, num_classes, use_random, is_cascade) - # Step3: make output + # Step3: make output rois_per_image = bbox if is_cascade else paddle.gather(bbox, sampled_inds) sampled_gt_ind = matches if is_cascade else paddle.gather(matches, @@ -337,7 +340,7 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, # generate fake roi if foreground is empty if fg_inds.numel() == 0: has_fg = False - fg_inds = paddle.ones([1], dtype='int32') + fg_inds = paddle.ones([1, 1], dtype='int64') inds_per_im = sampled_gt_inds[k] inds_per_im = paddle.gather(inds_per_im, fg_inds) @@ -356,7 +359,7 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, fg_inds_new = fg_inds.reshape([-1]).numpy() results = [] if len(gt_segms_per_im) > 0: - for j in fg_inds_new: + for j in range(fg_inds_new.shape[0]): results.append( rasterize_polygons_within_box(new_segm[j], boxes[j], resolution)) diff --git a/paddlers/models/ppdet/modeling/proposal_generator/target_layer.py b/paddlers/models/ppdet/modeling/proposal_generator/target_layer.py index a52ead1..57e5539 100644 --- a/paddlers/models/ppdet/modeling/proposal_generator/target_layer.py +++ b/paddlers/models/ppdet/modeling/proposal_generator/target_layer.py @@ -1,15 +1,15 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import sys import paddle @@ -29,27 +29,27 @@ class RPNTargetAssign(object): The assignment consists of three steps: 1. Match anchor and ground-truth box, label the anchor with foreground or background sample - 2. Sample anchors to keep the properly ratio between foreground and + 2. Sample anchors to keep the properly ratio between foreground and background 3. Generate the targets for classification and regression branch Args: - batch_size_per_im (int): Total number of RPN samples per image. + batch_size_per_im (int): Total number of RPN samples per image. default 256 fg_fraction (float): Fraction of anchors that is labeled foreground, default 0.5 positive_overlap (float): Minimum overlap required between an anchor - and ground-truth box for the (anchor, gt box) pair to be + and ground-truth box for the (anchor, gt box) pair to be a foreground sample. default 0.7 negative_overlap (float): Maximum overlap allowed between an anchor - and ground-truth box for the (anchor, gt box) pair to be + and ground-truth box for the (anchor, gt box) pair to be a background sample. default 0.3 ignore_thresh(float): Threshold for ignoring the is_crowd ground-truth if the value is larger than zero. - use_random (bool): Use random sampling to choose foreground and + use_random (bool): Use random sampling to choose foreground and background boxes, default true. - assign_on_cpu (bool): In case the number of gt box is too large, + assign_on_cpu (bool): In case the number of gt box is too large, compute IoU on CPU, default false. """ @@ -104,13 +104,13 @@ class BBoxAssigner(object): The assignment consists of three steps: 1. Match RoIs and ground-truth box, label the RoIs with foreground or background sample - 2. Sample anchors to keep the properly ratio between foreground and + 2. Sample anchors to keep the properly ratio between foreground and background 3. Generate the targets for classification and regression branch Args: - batch_size_per_im (int): Total number of RoIs per image. - default 512 + batch_size_per_im (int): Total number of RoIs per image. + default 512 fg_fraction (float): Fraction of RoIs that is labeled foreground, default 0.25 fg_thresh (float): Minimum overlap required between a RoI @@ -121,12 +121,12 @@ class BBoxAssigner(object): a background sample. default 0.5 ignore_thresh(float): Threshold for ignoring the is_crowd ground-truth if the value is larger than zero. - use_random (bool): Use random sampling to choose foreground and + use_random (bool): Use random sampling to choose foreground and background boxes, default true cascade_iou (list[iou]): The list of overlap to select foreground and background of each stage, which is only used In Cascade RCNN. num_classes (int): The number of class. - assign_on_cpu (bool): In case the number of gt box is too large, + assign_on_cpu (bool): In case the number of gt box is too large, compute IoU on CPU, default false. 
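# --- Illustrative sketch (not part of the patch) ---------------------------
# label_box in this patch fixes the assign_on_cpu path: instead of
# restoring a hard-coded "gpu" after computing IoU on the CPU, it records
# the active device and restores exactly that, so CPU-only runs no longer
# break. The same save/restore pattern as a reusable context manager
# (hypothetical helper, not in the patch):
import contextlib
import paddle

@contextlib.contextmanager
def on_device(device):
    previous = paddle.device.get_device()   # e.g. 'gpu:0' or 'cpu'
    paddle.set_device(device)
    try:
        yield
    finally:
        paddle.set_device(previous)         # restore whatever was active

# with on_device('cpu'):
#     iou = bbox_overlaps(gt_boxes, anchors)  # large-N IoU on the host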
""" @@ -156,7 +156,8 @@ class BBoxAssigner(object): rpn_rois_num, inputs, stage=0, - is_cascade=False): + is_cascade=False, + add_gt_as_proposals=True): gt_classes = inputs['gt_class'] gt_boxes = inputs['gt_bbox'] is_crowd = inputs.get('is_crowd', None) @@ -166,7 +167,7 @@ class BBoxAssigner(object): rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im, self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes, self.ignore_thresh, is_crowd, self.use_random, is_cascade, - self.cascade_iou[stage], self.assign_on_cpu) + self.cascade_iou[stage], self.assign_on_cpu, add_gt_as_proposals) rois = outs[0] rois_num = outs[-1] # tgt_labels, tgt_bboxes, tgt_gt_inds @@ -254,7 +255,7 @@ class MaskAssigner(object): The assignment consists of three steps: 1. Select RoIs labels with foreground. - 2. Encode the RoIs and corresponding gt polygons to generate + 2. Encode the RoIs and corresponding gt polygons to generate mask target Args: @@ -365,21 +366,11 @@ class RBoxAssigner(object): def assign_anchor(self, anchors, gt_bboxes, - gt_lables, + gt_labels, pos_iou_thr, neg_iou_thr, min_iou_thr=0.0, ignore_iof_thr=-2): - """ - - Args: - anchors: - gt_bboxes:[M, 5] rc,yc,w,h,angle - gt_lables: - - Returns: - - """ assert anchors.shape[1] == 4 or anchors.shape[1] == 5 assert gt_bboxes.shape[1] == 4 or gt_bboxes.shape[1] == 5 anchors_xc_yc = anchors @@ -392,9 +383,9 @@ class RBoxAssigner(object): gt_bboxes_xc_yc = paddle.to_tensor(gt_bboxes_xc_yc) try: - from rbox_iou_ops import rbox_iou + from ext_op import rbox_iou except Exception as e: - print("import custom_ops error, try install rbox_iou_ops " \ + print("import custom_ops error, try install ext_op " \ "following ppdet/ext_op/README.md", e) sys.stdout.flush() sys.exit(-1) @@ -428,12 +419,12 @@ class RBoxAssigner(object): # (4) assign max_iou as pos_ids >=0 anchor_gt_bbox_iou_inds = anchor_gt_bbox_inds[gt_bbox_anchor_iou_inds] # gt_bbox_anchor_iou_inds = np.logical_and(gt_bbox_anchor_iou_inds, anchor_gt_bbox_iou >= min_iou_thr) - labels[gt_bbox_anchor_iou_inds] = gt_lables[anchor_gt_bbox_iou_inds] + labels[gt_bbox_anchor_iou_inds] = gt_labels[anchor_gt_bbox_iou_inds] # (5) assign >= pos_iou_thr as pos_ids iou_pos_iou_thr_ids = anchor_gt_bbox_iou >= pos_iou_thr iou_pos_iou_thr_ids_box_inds = anchor_gt_bbox_inds[iou_pos_iou_thr_ids] - labels[iou_pos_iou_thr_ids] = gt_lables[iou_pos_iou_thr_ids_box_inds] + labels[iou_pos_iou_thr_ids] = gt_labels[iou_pos_iou_thr_ids_box_inds] return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels def __call__(self, anchors, gt_bboxes, gt_labels, is_crowd): diff --git a/paddlers/models/ppdet/modeling/rbox_utils.py b/paddlers/models/ppdet/modeling/rbox_utils.py new file mode 100644 index 0000000..19bca8d --- /dev/null +++ b/paddlers/models/ppdet/modeling/rbox_utils.py @@ -0,0 +1,159 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import paddle +import numpy as np +import cv2 + + +def norm_angle(angle, range=[-np.pi / 4, np.pi]): + return (angle - range[0]) % range[1] + range[0] + + +# rbox function implemented using numpy +def poly2rbox_le135_np(poly): + """convert poly to rbox [-pi / 4, 3 * pi / 4] + + Args: + poly: [x1, y1, x2, y2, x3, y3, x4, y4] + + Returns: + rbox: [cx, cy, w, h, angle] + """ + poly = np.array(poly[:8], dtype=np.float32) + + pt1 = (poly[0], poly[1]) + pt2 = (poly[2], poly[3]) + pt3 = (poly[4], poly[5]) + pt4 = (poly[6], poly[7]) + + edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[1]) * + (pt1[1] - pt2[1])) + edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[1]) * + (pt2[1] - pt3[1])) + + width = max(edge1, edge2) + height = min(edge1, edge2) + + rbox_angle = 0 + if edge1 > edge2: + rbox_angle = np.arctan2(float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0])) + elif edge2 >= edge1: + rbox_angle = np.arctan2(float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0])) + + rbox_angle = norm_angle(rbox_angle) + + x_ctr = float(pt1[0] + pt3[0]) / 2 + y_ctr = float(pt1[1] + pt3[1]) / 2 + return [x_ctr, y_ctr, width, height, rbox_angle] + + +def poly2rbox_oc_np(poly): + """convert poly to rbox (0, pi / 2] + + Args: + poly: [x1, y1, x2, y2, x3, y3, x4, y4] + + Returns: + rbox: [cx, cy, w, h, angle] + """ + points = np.array(poly, dtype=np.float32).reshape((-1, 2)) + (cx, cy), (w, h), angle = cv2.minAreaRect(points) + # using the new OpenCV Rotated BBox definition since 4.5.1 + # if angle < 0, opencv is older than 4.5.1, angle is in [-90, 0) + if angle < 0: + angle += 90 + w, h = h, w + + # convert angle to [0, 90) + if angle == -0.0: + angle = 0.0 + if angle == 90.0: + angle = 0.0 + w, h = h, w + + angle = angle / 180 * np.pi + return [cx, cy, w, h, angle] + + +def poly2rbox_np(polys, rbox_type='oc'): + """ + polys: [x0,y0,x1,y1,x2,y2,x3,y3] + to + rboxes: [x_ctr,y_ctr,w,h,angle] + """ + assert rbox_type in ['oc', 'le135'], 'only oc or le135 is supported now' + poly2rbox_fn = poly2rbox_oc_np if rbox_type == 'oc' else poly2rbox_le135_np + rboxes = [] + for poly in polys: + x, y, w, h, angle = poly2rbox_fn(poly) + rbox = np.array([x, y, w, h, angle], dtype=np.float32) + rboxes.append(rbox) + + return np.array(rboxes) + + +def cal_line_length(point1, point2): + return math.sqrt( + math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) + + +def get_best_begin_point_single(coordinate): + x1, y1, x2, y2, x3, y3, x4, y4 = coordinate + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + xmax = max(x1, x2, x3, x4) + ymax = max(y1, y2, y3, y4) + combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], + [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], + [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], + [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] + dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] + force = 100000000.0 + force_flag = 0 + for i in range(4): + temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ + + cal_line_length(combinate[i][1], dst_coordinate[1]) \ + + cal_line_length(combinate[i][2], dst_coordinate[2]) \ + + cal_line_length(combinate[i][3], dst_coordinate[3]) + if temp_force < force: + force = temp_force + force_flag = i + if force_flag != 0: + pass + return np.array(combinate[force_flag]).reshape(8) + + +def rbox2poly_np(rboxes): + """ + rboxes:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + polys = [] + for i in range(len(rboxes)): + x_ctr, y_ctr, width, height, angle = rboxes[i][:5] + 
tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + polys.append(poly) + polys = np.array(polys) + return polys diff --git a/paddlers/models/ppdet/modeling/reid/__init__.py b/paddlers/models/ppdet/modeling/reid/__init__.py index 968e95c..2630ecf 100644 --- a/paddlers/models/ppdet/modeling/reid/__init__.py +++ b/paddlers/models/ppdet/modeling/reid/__init__.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from . import jde_embedding_head @@ -17,9 +17,11 @@ from . import fairmot_embedding_head from . import resnet from . import pyramidal_embedding from . import pplcnet_embedding +from . import resnet_embedding from .fairmot_embedding_head import * from .jde_embedding_head import * from .resnet import * from .pyramidal_embedding import * from .pplcnet_embedding import * +from .resnet_embedding import * diff --git a/paddlers/models/ppdet/modeling/reid/fairmot_embedding_head.py b/paddlers/models/ppdet/modeling/reid/fairmot_embedding_head.py old mode 100644 new mode 100755 index e4d5364..88fda65 --- a/paddlers/models/ppdet/modeling/reid/fairmot_embedding_head.py +++ b/paddlers/models/ppdet/modeling/reid/fairmot_embedding_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. import numpy as np @@ -33,7 +33,7 @@ class FairMOTEmbeddingHead(nn.Layer): ch_head (int): the channel of features before fed into embedding, 256 by default. ch_emb (int): the channel of the embedding feature, 128 by default. num_identities_dict (dict): the number of identities of each category, - support single class and multi-calss, {0: 14455} as default. + support single class and multi-class, {0: 14455} as default. """ def __init__(self, diff --git a/paddlers/models/ppdet/modeling/reid/jde_embedding_head.py b/paddlers/models/ppdet/modeling/reid/jde_embedding_head.py index b13e2d3..8764692 100644 --- a/paddlers/models/ppdet/modeling/reid/jde_embedding_head.py +++ b/paddlers/models/ppdet/modeling/reid/jde_embedding_head.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import @@ -152,9 +152,8 @@ class JDEEmbeddingHead(nn.Layer): scale_factor = targets['scale_factor'][0].numpy() bboxes[:, 2:] = self.scale_coords(bboxes[:, 2:], input_shape, im_shape, scale_factor) - # tlwhs, scores, cls_ids - pred_dets = paddle.concat( - (bboxes[:, 2:], bboxes[:, 1:2], bboxes[:, 0:1]), axis=1) + # cls_ids, scores, tlwhs + pred_dets = bboxes return pred_dets, pred_embs def scale_coords(self, coords, input_shape, im_shape, scale_factor): diff --git a/paddlers/models/ppdet/modeling/reid/pplcnet_embedding.py b/paddlers/models/ppdet/modeling/reid/pplcnet_embedding.py index 1da21d2..1915a60 100644 --- a/paddlers/models/ppdet/modeling/reid/pplcnet_embedding.py +++ b/paddlers/models/ppdet/modeling/reid/pplcnet_embedding.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import diff --git a/paddlers/models/ppdet/modeling/reid/pyramidal_embedding.py b/paddlers/models/ppdet/modeling/reid/pyramidal_embedding.py index 9099ecd..6ee384d 100644 --- a/paddlers/models/ppdet/modeling/reid/pyramidal_embedding.py +++ b/paddlers/models/ppdet/modeling/reid/pyramidal_embedding.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. 
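The JDEEmbeddingHead hunk above stops reassembling detection columns with paddle.concat and returns bboxes unchanged, so pred_dets is now ordered [cls_id, score, tlwh] rather than [tlwh, score, cls_id]; any tracker consuming pred_dets must index the new layout. A hedged sketch of reading it (variable names invented; the four-column tlwh slice is an assumption):

    # pred_dets, pred_embs as returned by JDEEmbeddingHead at inference time
    cls_ids = pred_dets[:, 0:1]  # was the last column before this change
    scores = pred_dets[:, 1:2]
    tlwhs = pred_dets[:, 2:6]    # top-left x, top-left y, width, height (assumed)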
from __future__ import absolute_import @@ -21,7 +21,7 @@ import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from paddle import ParamAttr -from .resnet import * +from .resnet import ResNet50, ResNet101 from paddlers.models.ppdet.core.workspace import register __all__ = ['PCBPyramid'] @@ -46,6 +46,7 @@ class PCBPyramid(nn.Layer): def __init__(self, input_ch=2048, + model_name='ResNet101', num_stripes=6, used_levels=(1, 1, 1, 1, 1, 1), num_classes=751, @@ -60,10 +61,11 @@ class PCBPyramid(nn.Layer): self.num_in_each_level = [i for i in range(self.num_stripes, 0, -1)] self.num_branches = sum(self.num_in_each_level) - self.base = ResNet101( - lr_mult=0.1, - last_conv_stride=last_conv_stride, - last_conv_dilation=last_conv_dilation) + assert model_name in ['ResNet50', 'ResNet101' + ], "Unsupported ReID arch: {}".format(model_name) + self.base = eval(model_name)(lr_mult=0.1, + last_conv_stride=last_conv_stride, + last_conv_dilation=last_conv_dilation) self.dropout_layer = nn.Dropout(p=0.2) self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch( num_conv_out_channels, input_ch) diff --git a/paddlers/models/ppdet/modeling/reid/resnet.py b/paddlers/models/ppdet/modeling/reid/resnet.py index 4316f83..2e2a855 100644 --- a/paddlers/models/ppdet/modeling/reid/resnet.py +++ b/paddlers/models/ppdet/modeling/reid/resnet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -55,7 +55,7 @@ class ConvBNLayer(nn.Layer): bias_attr=False, data_format=data_format) - self._batch_norm = nn.BatchNorm2D(num_filters, data_layout=data_format) + self._batch_norm = nn.BatchNorm2D(num_filters) self.act = act def forward(self, inputs): diff --git a/paddlers/models/ppdet/modeling/reid/resnet_embedding.py b/paddlers/models/ppdet/modeling/reid/resnet_embedding.py new file mode 100644 index 0000000..32d17be --- /dev/null +++ b/paddlers/models/ppdet/modeling/reid/resnet_embedding.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
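PCBPyramid above gains a model_name argument in place of a hard-coded ResNet101 backbone, resolved via eval() against the names now imported explicitly from .resnet. A hedged usage sketch, with every other constructor argument assumed to keep its default:

    from paddlers.models.ppdet.modeling.reid.pyramidal_embedding import PCBPyramid

    pcb = PCBPyramid(model_name='ResNet50')  # previously always ResNet101
    # Any other name trips the new assert:
    # PCBPyramid(model_name='ResNet18')  ->  AssertionError: Unsupported ReID arch: ResNet18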
+ +import os +import paddle +import paddle.nn.functional as F +from paddle import nn +from .resnet import ResNet50, ResNet101 +from paddlers.models.ppdet.core.workspace import register + +__all__ = ['ResNetEmbedding'] + + +@register +class ResNetEmbedding(nn.Layer): + in_planes = 2048 + + def __init__(self, model_name='ResNet50', last_stride=1): + super(ResNetEmbedding, self).__init__() + assert model_name in ['ResNet50', 'ResNet101' + ], "Unsupported ReID arch: {}".format(model_name) + self.base = eval(model_name)(last_conv_stride=last_stride) + self.gap = nn.AdaptiveAvgPool2D(output_size=1) + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.bn = nn.BatchNorm1D(self.in_planes, bias_attr=False) + + def forward(self, x): + base_out = self.base(x) + global_feat = self.gap(base_out) + global_feat = self.flatten(global_feat) + global_feat = self.bn(global_feat) + return global_feat diff --git a/paddlers/models/ppdet/modeling/shape_spec.py b/paddlers/models/ppdet/modeling/shape_spec.py index d42042a..bf6d11e 100644 --- a/paddlers/models/ppdet/modeling/shape_spec.py +++ b/paddlers/models/ppdet/modeling/shape_spec.py @@ -1,15 +1,15 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. # The code is based on: diff --git a/paddlers/models/ppdet/modeling/tests/__init__.py b/paddlers/models/ppdet/modeling/tests/__init__.py new file mode 100644 index 0000000..5135585 --- /dev/null +++ b/paddlers/models/ppdet/modeling/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
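A hedged usage sketch for the new ResNetEmbedding head above; the crop resolution is an assumption (typical ReID input), not something the file fixes:

    import paddle
    from paddlers.models.ppdet.modeling.reid.resnet_embedding import ResNetEmbedding

    head = ResNetEmbedding(model_name='ResNet50', last_stride=1)
    crops = paddle.randn([4, 3, 256, 128])  # NCHW batch of person crops (shape assumed)
    feat = head(crops)                      # backbone -> global avg pool -> flatten -> BatchNorm1D
    print(feat.shape)                       # [4, 2048], matching ResNetEmbedding.in_planes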
diff --git a/paddlers/models/ppdet/modeling/tests/imgs/coco2017_val2017_000000000139.jpg b/paddlers/models/ppdet/modeling/tests/imgs/coco2017_val2017_000000000139.jpg new file mode 100644 index 0000000..19023f7 Binary files /dev/null and b/paddlers/models/ppdet/modeling/tests/imgs/coco2017_val2017_000000000139.jpg differ diff --git a/paddlers/models/ppdet/modeling/tests/imgs/coco2017_val2017_000000000724.jpg b/paddlers/models/ppdet/modeling/tests/imgs/coco2017_val2017_000000000724.jpg new file mode 100644 index 0000000..2a17e0c Binary files /dev/null and b/paddlers/models/ppdet/modeling/tests/imgs/coco2017_val2017_000000000724.jpg differ diff --git a/paddlers/models/ppdet/modeling/tests/test_architectures.py b/paddlers/models/ppdet/modeling/tests/test_architectures.py new file mode 100644 index 0000000..31cac3d --- /dev/null +++ b/paddlers/models/ppdet/modeling/tests/test_architectures.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import paddlers.models.ppdet as ppdet + + +class TestFasterRCNN(unittest.TestCase): + def setUp(self): + self.set_config() + + def set_config(self): + self.cfg_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml' + + def test_trainer(self): + # Trainer __init__ will build model and DataLoader + # 'train' and 'eval' mode include dataset loading + # use 'test' mode to simplify tests + cfg = ppdet.core.workspace.load_config(self.cfg_file) + trainer = ppdet.engine.Trainer(cfg, mode='test') + + +class TestMaskRCNN(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml' + + +class TestCascadeRCNN(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml' + + +class TestYolov3(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/yolov3/yolov3_darknet53_270e_coco.yml' + + +class TestSSD(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/ssd/ssd_vgg16_300_240e_voc.yml' + + +class TestGFL(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/gfl/gfl_r50_fpn_1x_coco.yml' + + +class TestPicoDet(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/picodet/picodet_s_320_coco_lcnet.yml' + + +if __name__ == '__main__': + unittest.main() diff --git a/paddlers/models/ppdet/modeling/tests/test_base.py b/paddlers/models/ppdet/modeling/tests/test_base.py new file mode 100644 index 0000000..0123cfd --- /dev/null +++ b/paddlers/models/ppdet/modeling/tests/test_base.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +import contextlib + +import paddle +from paddle.static import Program + + +class LayerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.seed = 111 + + @classmethod + def tearDownClass(cls): + pass + + def _get_place(self, force_to_use_cpu=False): + # this option for ops that only have cpu kernel + if force_to_use_cpu: + return 'cpu' + else: + return paddle.device.get_device() + + @contextlib.contextmanager + def static_graph(self): + paddle.enable_static() + scope = paddle.static.Scope() + program = Program() + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(program): + paddle.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + yield + + def get_static_graph_result(self, + feed, + fetch_list, + with_lod=False, + force_to_use_cpu=False): + exe = paddle.static.Executor(self._get_place(force_to_use_cpu)) + exe.run(paddle.static.default_startup_program()) + return exe.run(paddle.static.default_main_program(), + feed=feed, + fetch_list=fetch_list, + return_numpy=(not with_lod)) + + @contextlib.contextmanager + def dynamic_graph(self, force_to_use_cpu=False): + paddle.disable_static() + place = self._get_place(force_to_use_cpu=force_to_use_cpu) + paddle.device.set_device(place) + paddle.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + yield diff --git a/paddlers/models/ppdet/modeling/tests/test_mstest.py b/paddlers/models/ppdet/modeling/tests/test_mstest.py new file mode 100644 index 0000000..ba62333 --- /dev/null +++ b/paddlers/models/ppdet/modeling/tests/test_mstest.py @@ -0,0 +1,62 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
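The LayerTest base class added in test_base.py above underpins the op tests that follow: each case runs an op once inside static_graph() and once inside dynamic_graph() and asserts the results agree. A minimal hedged sketch of that pattern (the op chosen here is arbitrary):

    import numpy as np
    import paddle

    class TestReluParity(LayerTest):  # LayerTest from the test_base.py above
        def test_relu(self):
            x_np = np.random.rand(2, 3).astype('float32')
            with self.static_graph():
                x = paddle.static.data(name='x', shape=[2, 3], dtype='float32')
                out = paddle.nn.functional.relu(x)
                out_np, = self.get_static_graph_result(
                    feed={'x': x_np}, fetch_list=[out])
            with self.dynamic_graph():
                out_dy = paddle.nn.functional.relu(paddle.to_tensor(x_np)).numpy()
            self.assertTrue(np.array_equal(out_np, out_dy))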
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import unittest +from paddlers.models.ppdet.core.workspace import load_config +from paddlers.models.ppdet.engine import Trainer + + +class TestMultiScaleInference(unittest.TestCase): + def setUp(self): + self.set_config() + + def set_config(self): + self.mstest_cfg_file = 'configs/faster_rcnn/faster_rcnn_r34_fpn_multiscaletest_1x_coco.yml' + + # test evaluation with multi scale test + def test_eval_mstest(self): + cfg = load_config(self.mstest_cfg_file) + trainer = Trainer(cfg, mode='eval') + + cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams' + trainer.load_weights(cfg.weights) + + trainer.evaluate() + + # test inference with multi scale test + def test_infer_mstest(self): + cfg = load_config(self.mstest_cfg_file) + trainer = Trainer(cfg, mode='test') + + cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams' + trainer.load_weights(cfg.weights) + tests_img_root = os.path.join(os.path.dirname(__file__), 'imgs') + + # input images to predict + imgs = [ + 'coco2017_val2017_000000000139.jpg', + 'coco2017_val2017_000000000724.jpg' + ] + imgs = [os.path.join(tests_img_root, img) for img in imgs] + trainer.predict( + imgs, draw_threshold=0.5, output_dir='output', save_results=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddlers/models/ppdet/modeling/tests/test_ops.py b/paddlers/models/ppdet/modeling/tests/test_ops.py new file mode 100644 index 0000000..3bf2f28 --- /dev/null +++ b/paddlers/models/ppdet/modeling/tests/test_ops.py @@ -0,0 +1,584 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os, sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +import unittest +import numpy as np + +import paddle + +import paddlers.models.ppdet.modeling.ops as ops +from paddlers.models.ppdet.modeling.tests.test_base import LayerTest + + +def make_rois(h, w, rois_num, output_size): + rois = np.zeros((0, 4)).astype('float32') + for roi_num in rois_num: + roi = np.zeros((roi_num, 4)).astype('float32') + roi[:, 0] = np.random.randint(0, h - output_size[0], size=roi_num) + roi[:, 1] = np.random.randint(0, w - output_size[1], size=roi_num) + roi[:, 2] = np.random.randint(roi[:, 0] + output_size[0], h) + roi[:, 3] = np.random.randint(roi[:, 1] + output_size[1], w) + rois = np.vstack((rois, roi)) + return rois + + +def softmax(x): + # clip to shiftx, otherwise, when calc loss with + # log(exp(shiftx)), may get log(0)=INF + shiftx = (x - np.max(x)).clip(-64.) 
+ exps = np.exp(shiftx) + return exps / np.sum(exps) + + +class TestDistributeFpnProposals(LayerTest): + def test_distribute_fpn_proposals(self): + rois_np = np.random.rand(10, 4).astype('float32') + rois_num_np = np.array([4, 6]).astype('int32') + with self.static_graph(): + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + multi_rois, restore_ind, rois_num_per_level = ops.distribute_fpn_proposals( + fpn_rois=rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num) + fetch_list = multi_rois + [restore_ind] + rois_num_per_level + output_stat = self.get_static_graph_result( + feed={'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=fetch_list, + with_lod=True) + output_stat_np = [] + for output in output_stat: + output_np = np.array(output) + if len(output_np) > 0: + output_stat_np.append(output_np) + + with self.dynamic_graph(): + rois_dy = paddle.to_tensor(rois_np) + rois_num_dy = paddle.to_tensor(rois_num_np) + multi_rois_dy, restore_ind_dy, rois_num_per_level_dy = ops.distribute_fpn_proposals( + fpn_rois=rois_dy, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num_dy) + output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy + output_dy_np = [] + for output in output_dy: + output_np = output.numpy() + if len(output_np) > 0: + output_dy_np.append(output_np) + + for res_stat, res_dy in zip(output_stat_np, output_dy_np): + self.assertTrue(np.array_equal(res_stat, res_dy)) + + def test_distribute_fpn_proposals_error(self): + with self.static_graph(): + fpn_rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + ops.distribute_fpn_proposals, + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224) + + paddle.disable_static() + + +class TestROIAlign(LayerTest): + def test_roi_align(self): + b, c, h, w = 2, 12, 20, 20 + inputs_np = np.random.rand(b, c, h, w).astype('float32') + rois_num = [4, 6] + output_size = (7, 7) + rois_np = make_rois(h, w, rois_num, output_size) + rois_num_np = np.array(rois_num).astype('int32') + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[b, c, h, w], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output = paddle.vision.ops.roi_align( + x=inputs, + boxes=rois, + boxes_num=rois_num, + output_size=output_size) + output_np, = self.get_static_graph_result( + feed={ + 'inputs': inputs_np, + 'rois': rois_np, + 'rois_num': rois_num_np + }, + fetch_list=output, + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = paddle.to_tensor(inputs_np) + rois_dy = paddle.to_tensor(rois_np) + rois_num_dy = paddle.to_tensor(rois_num_np) + + output_dy = paddle.vision.ops.roi_align( + x=inputs_dy, + boxes=rois_dy, + boxes_num=rois_num_dy, + output_size=output_size) + output_dy_np = output_dy.numpy() + + self.assertTrue(np.array_equal(output_np, output_dy_np)) + + def test_roi_align_error(self): + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[2, 12, 20, 20], dtype='float32') + rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + paddle.vision.ops.roi_align, + input=inputs, + rois=rois, + output_size=(7, 7)) + + 
paddle.disable_static() + + +class TestROIPool(LayerTest): + def test_roi_pool(self): + b, c, h, w = 2, 12, 20, 20 + inputs_np = np.random.rand(b, c, h, w).astype('float32') + rois_num = [4, 6] + output_size = (7, 7) + rois_np = make_rois(h, w, rois_num, output_size) + rois_num_np = np.array(rois_num).astype('int32') + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[b, c, h, w], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output = paddle.vision.ops.roi_pool( + x=inputs, + boxes=rois, + boxes_num=rois_num, + output_size=output_size) + output_np, = self.get_static_graph_result( + feed={ + 'inputs': inputs_np, + 'rois': rois_np, + 'rois_num': rois_num_np + }, + fetch_list=[output], + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = paddle.to_tensor(inputs_np) + rois_dy = paddle.to_tensor(rois_np) + rois_num_dy = paddle.to_tensor(rois_num_np) + + output_dy = paddle.vision.ops.roi_pool( + x=inputs_dy, + boxes=rois_dy, + boxes_num=rois_num_dy, + output_size=output_size) + output_dy_np = output_dy.numpy() + + self.assertTrue(np.array_equal(output_np, output_dy_np)) + + def test_roi_pool_error(self): + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[2, 12, 20, 20], dtype='float32') + rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + paddle.vision.ops.roi_pool, + input=inputs, + rois=rois, + output_size=(7, 7)) + + paddle.disable_static() + + +class TestPriorBox(LayerTest): + def test_prior_box(self): + input_np = np.random.rand(2, 10, 32, 32).astype('float32') + image_np = np.random.rand(2, 10, 40, 40).astype('float32') + min_sizes = [2, 4] + with self.static_graph(): + input = paddle.static.data( + name='input', shape=[2, 10, 32, 32], dtype='float32') + image = paddle.static.data( + name='image', shape=[2, 10, 40, 40], dtype='float32') + + box, var = ops.prior_box( + input=input, + image=image, + min_sizes=min_sizes, + clip=True, + flip=True) + box_np, var_np = self.get_static_graph_result( + feed={ + 'input': input_np, + 'image': image_np, + }, + fetch_list=[box, var], + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = paddle.to_tensor(input_np) + image_dy = paddle.to_tensor(image_np) + + box_dy, var_dy = ops.prior_box( + input=inputs_dy, + image=image_dy, + min_sizes=min_sizes, + clip=True, + flip=True) + box_dy_np = box_dy.numpy() + var_dy_np = var_dy.numpy() + + self.assertTrue(np.array_equal(box_np, box_dy_np)) + self.assertTrue(np.array_equal(var_np, var_dy_np)) + + def test_prior_box_error(self): + with self.static_graph(): + input = paddle.static.data( + name='input', shape=[2, 10, 32, 32], dtype='int32') + image = paddle.static.data( + name='image', shape=[2, 10, 40, 40], dtype='int32') + self.assertRaises( + TypeError, + ops.prior_box, + input=input, + image=image, + min_sizes=[2, 4], + clip=True, + flip=True) + + paddle.disable_static() + + +class TestMulticlassNms(LayerTest): + def test_multiclass_nms(self): + boxes_np = np.random.rand(10, 81, 4).astype('float32') + scores_np = np.random.rand(10, 81).astype('float32') + rois_num_np = np.array([2, 8]).astype('int32') + with self.static_graph(): + boxes = paddle.static.data( + name='bboxes', + shape=[None, 81, 4], + dtype='float32', + lod_level=1) + scores = paddle.static.data( + name='scores', shape=[None, 81], dtype='float32', lod_level=1) 
+ rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output = ops.multiclass_nms( + bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num) + out_np, index_np, nms_rois_num_np = self.get_static_graph_result( + feed={ + 'bboxes': boxes_np, + 'scores': scores_np, + 'rois_num': rois_num_np + }, + fetch_list=output, + with_lod=True) + out_np = np.array(out_np) + index_np = np.array(index_np) + nms_rois_num_np = np.array(nms_rois_num_np) + + with self.dynamic_graph(): + boxes_dy = paddle.to_tensor(boxes_np) + scores_dy = paddle.to_tensor(scores_np) + rois_num_dy = paddle.to_tensor(rois_num_np) + + out_dy, index_dy, nms_rois_num_dy = ops.multiclass_nms( + bboxes=boxes_dy, + scores=scores_dy, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num_dy) + out_dy_np = out_dy.numpy() + index_dy_np = index_dy.numpy() + nms_rois_num_dy_np = nms_rois_num_dy.numpy() + + self.assertTrue(np.array_equal(out_np, out_dy_np)) + self.assertTrue(np.array_equal(index_np, index_dy_np)) + self.assertTrue(np.array_equal(nms_rois_num_np, nms_rois_num_dy_np)) + + def test_multiclass_nms_error(self): + with self.static_graph(): + boxes = paddle.static.data( + name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) + scores = paddle.static.data( + name='scores', shape=[81], dtype='float32', lod_level=1) + rois_num = paddle.static.data( + name='rois_num', shape=[40, 41], dtype='int32') + self.assertRaises( + TypeError, + ops.multiclass_nms, + boxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num) + + +class TestMatrixNMS(LayerTest): + def test_matrix_nms(self): + N, M, C = 7, 1200, 21 + BOX_SIZE = 4 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = 0.01 + post_threshold = 0. 
+ + scores_np = np.random.random((N * M, C)).astype('float32') + scores_np = np.apply_along_axis(softmax, 1, scores_np) + scores_np = np.reshape(scores_np, (N, M, C)) + scores_np = np.transpose(scores_np, (0, 2, 1)) + + boxes_np = np.random.random((N, M, BOX_SIZE)).astype('float32') + boxes_np[:, :, 0:2] = boxes_np[:, :, 0:2] * 0.5 + boxes_np[:, :, 2:4] = boxes_np[:, :, 2:4] * 0.5 + 0.5 + + with self.static_graph(): + boxes = paddle.static.data( + name='boxes', shape=[N, M, BOX_SIZE], dtype='float32') + scores = paddle.static.data( + name='scores', shape=[N, C, M], dtype='float32') + out, index, _ = ops.matrix_nms( + bboxes=boxes, + scores=scores, + score_threshold=score_threshold, + post_threshold=post_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + return_index=True) + out_np, index_np = self.get_static_graph_result( + feed={'boxes': boxes_np, + 'scores': scores_np}, + fetch_list=[out, index], + with_lod=True) + + with self.dynamic_graph(): + boxes_dy = paddle.to_tensor(boxes_np) + scores_dy = paddle.to_tensor(scores_np) + + out_dy, index_dy, _ = ops.matrix_nms( + bboxes=boxes_dy, + scores=scores_dy, + score_threshold=score_threshold, + post_threshold=post_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + return_index=True) + out_dy_np = out_dy.numpy() + index_dy_np = index_dy.numpy() + + self.assertTrue(np.array_equal(out_np, out_dy_np)) + self.assertTrue(np.array_equal(index_np, index_dy_np)) + + def test_matrix_nms_error(self): + with self.static_graph(): + bboxes = paddle.static.data( + name='bboxes', shape=[7, 1200, 4], dtype='float32') + scores = paddle.static.data( + name='data_error', shape=[7, 21, 1200], dtype='int32') + self.assertRaises( + TypeError, + ops.matrix_nms, + bboxes=bboxes, + scores=scores, + score_threshold=0.01, + post_threshold=0., + nms_top_k=400, + keep_top_k=200, + return_index=True) + + paddle.disable_static() + + +class TestBoxCoder(LayerTest): + def test_box_coder(self): + + prior_box_np = np.random.random((81, 4)).astype('float32') + prior_box_var_np = np.random.random((81, 4)).astype('float32') + target_box_np = np.random.random((20, 81, 4)).astype('float32') + + # static + with self.static_graph(): + prior_box = paddle.static.data( + name='prior_box', shape=[81, 4], dtype='float32') + prior_box_var = paddle.static.data( + name='prior_box_var', shape=[81, 4], dtype='float32') + target_box = paddle.static.data( + name='target_box', shape=[20, 81, 4], dtype='float32') + + boxes = ops.box_coder( + prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=target_box, + code_type="decode_center_size", + box_normalized=False) + + boxes_np, = self.get_static_graph_result( + feed={ + 'prior_box': prior_box_np, + 'prior_box_var': prior_box_var_np, + 'target_box': target_box_np, + }, + fetch_list=[boxes], + with_lod=False) + + # dygraph + with self.dynamic_graph(): + prior_box_dy = paddle.to_tensor(prior_box_np) + prior_box_var_dy = paddle.to_tensor(prior_box_var_np) + target_box_dy = paddle.to_tensor(target_box_np) + + boxes_dy = ops.box_coder( + prior_box=prior_box_dy, + prior_box_var=prior_box_var_dy, + target_box=target_box_dy, + code_type="decode_center_size", + box_normalized=False) + + boxes_dy_np = boxes_dy.numpy() + + self.assertTrue(np.array_equal(boxes_np, boxes_dy_np)) + + def test_box_coder_error(self): + with self.static_graph(): + prior_box = paddle.static.data( + name='prior_box', shape=[81, 4], dtype='int32') + prior_box_var = paddle.static.data( + name='prior_box_var', shape=[81, 4], dtype='float32') + target_box = 
paddle.static.data( + name='target_box', shape=[20, 81, 4], dtype='float32') + + self.assertRaises(TypeError, ops.box_coder, prior_box, + prior_box_var, target_box) + + paddle.disable_static() + + +class TestGenerateProposals(LayerTest): + def test_generate_proposals(self): + scores_np = np.random.rand(2, 3, 4, 4).astype('float32') + bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32') + im_shape_np = np.array([[8, 8], [6, 6]]).astype('float32') + anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4), + [4, 4, 3, 4]).astype('float32') + variances_np = np.ones((4, 4, 3, 4)).astype('float32') + + with self.static_graph(): + scores = paddle.static.data( + name='scores', shape=[2, 3, 4, 4], dtype='float32') + bbox_deltas = paddle.static.data( + name='bbox_deltas', shape=[2, 12, 4, 4], dtype='float32') + im_shape = paddle.static.data( + name='im_shape', shape=[2, 2], dtype='float32') + anchors = paddle.static.data( + name='anchors', shape=[4, 4, 3, 4], dtype='float32') + variances = paddle.static.data( + name='var', shape=[4, 4, 3, 4], dtype='float32') + rois, roi_probs, rois_num = ops.generate_proposals( + scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_stat, roi_probs_stat, rois_num_stat = self.get_static_graph_result( + feed={ + 'scores': scores_np, + 'bbox_deltas': bbox_deltas_np, + 'im_shape': im_shape_np, + 'anchors': anchors_np, + 'var': variances_np + }, + fetch_list=[rois, roi_probs, rois_num], + with_lod=True) + + with self.dynamic_graph(): + scores_dy = paddle.to_tensor(scores_np) + bbox_deltas_dy = paddle.to_tensor(bbox_deltas_np) + im_shape_dy = paddle.to_tensor(im_shape_np) + anchors_dy = paddle.to_tensor(anchors_np) + variances_dy = paddle.to_tensor(variances_np) + rois, roi_probs, rois_num = ops.generate_proposals( + scores_dy, + bbox_deltas_dy, + im_shape_dy, + anchors_dy, + variances_dy, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_dy = rois.numpy() + roi_probs_dy = roi_probs.numpy() + rois_num_dy = rois_num.numpy() + + self.assertTrue(np.array_equal(np.array(rois_stat), rois_dy)) + self.assertTrue(np.array_equal(np.array(roi_probs_stat), roi_probs_dy)) + self.assertTrue(np.array_equal(np.array(rois_num_stat), rois_num_dy)) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddlers/models/ppdet/modeling/tests/test_yolov3_loss.py b/paddlers/models/ppdet/modeling/tests/test_yolov3_loss.py new file mode 100644 index 0000000..af41c97 --- /dev/null +++ b/paddlers/models/ppdet/modeling/tests/test_yolov3_loss.py @@ -0,0 +1,406 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import division + +import unittest + +import paddle +import paddle.nn.functional as F +# add python path of PadleDetection to sys.path +import os +import sys +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from paddlers.models.ppdet.modeling.losses import YOLOv3Loss +from paddlers.models.ppdet.data.transform.op_helper import jaccard_overlap +from paddlers.models.ppdet.modeling.bbox_utils import iou_similarity +import numpy as np +np.random.seed(0) + + +def _split_output(output, an_num, num_classes): + """ + Split output feature map to x, y, w, h, objectness, classification + along channel dimension + """ + x = paddle.strided_slice( + output, + axes=[1], + starts=[0], + ends=[output.shape[1]], + strides=[5 + num_classes]) + y = paddle.strided_slice( + output, + axes=[1], + starts=[1], + ends=[output.shape[1]], + strides=[5 + num_classes]) + w = paddle.strided_slice( + output, + axes=[1], + starts=[2], + ends=[output.shape[1]], + strides=[5 + num_classes]) + h = paddle.strided_slice( + output, + axes=[1], + starts=[3], + ends=[output.shape[1]], + strides=[5 + num_classes]) + obj = paddle.strided_slice( + output, + axes=[1], + starts=[4], + ends=[output.shape[1]], + strides=[5 + num_classes]) + clss = [] + stride = output.shape[1] // an_num + for m in range(an_num): + clss.append( + paddle.slice( + output, + axes=[1], + starts=[stride * m + 5], + ends=[stride * m + 5 + num_classes])) + cls = paddle.transpose(paddle.stack(clss, axis=1), perm=[0, 1, 3, 4, 2]) + return (x, y, w, h, obj, cls) + + +def _split_target(target): + """ + split target to x, y, w, h, objectness, classification + along dimension 2 + target is in shape [N, an_num, 6 + class_num, H, W] + """ + tx = target[:, :, 0, :, :] + ty = target[:, :, 1, :, :] + tw = target[:, :, 2, :, :] + th = target[:, :, 3, :, :] + tscale = target[:, :, 4, :, :] + tobj = target[:, :, 5, :, :] + tcls = paddle.transpose(target[:, :, 6:, :, :], perm=[0, 1, 3, 4, 2]) + tcls.stop_gradient = True + return (tx, ty, tw, th, tscale, tobj, tcls) + + +def _calc_obj_loss(output, obj, tobj, gt_box, batch_size, anchors, num_classes, + downsample, ignore_thresh, scale_x_y): + # A prediction bbox overlap any gt_bbox over ignore_thresh, + # objectness loss will be ignored, process as follows: + # 1. get pred bbox, which is same with YOLOv3 infer mode, use yolo_box here + # NOTE: img_size is set as 1.0 to get noramlized pred bbox + bbox, prob = paddle.vision.ops.yolo_box( + x=output, + img_size=paddle.ones( + shape=[batch_size, 2], dtype="int32"), + anchors=anchors, + class_num=num_classes, + conf_thresh=0., + downsample_ratio=downsample, + clip_bbox=False, + scale_x_y=scale_x_y) + # 2. split pred bbox and gt bbox by sample, calculate IoU between pred bbox + # and gt bbox in each sample + if batch_size > 1: + preds = paddle.split(bbox, batch_size, axis=0) + gts = paddle.split(gt_box, batch_size, axis=0) + else: + preds = [bbox] + gts = [gt_box] + probs = [prob] + ious = [] + for pred, gt in zip(preds, gts): + + def box_xywh2xyxy(box): + x = box[:, 0] + y = box[:, 1] + w = box[:, 2] + h = box[:, 3] + return paddle.stack( + [ + x - w / 2., + y - h / 2., + x + w / 2., + y + h / 2., + ], axis=1) + + pred = paddle.squeeze(pred, axis=[0]) + gt = box_xywh2xyxy(paddle.squeeze(gt, axis=[0])) + ious.append(iou_similarity(pred, gt)) + iou = paddle.stack(ious, axis=0) + # 3. 
Get iou_mask by IoU between gt bbox and prediction bbox, + # Get obj_mask by tobj(holds gt_score), calculate objectness loss + max_iou = paddle.max(iou, axis=-1) + iou_mask = paddle.cast(max_iou <= ignore_thresh, dtype="float32") + output_shape = paddle.shape(output) + an_num = len(anchors) // 2 + iou_mask = paddle.reshape(iou_mask, (-1, an_num, output_shape[2], + output_shape[3])) + iou_mask.stop_gradient = True + # NOTE: tobj holds gt_score, obj_mask holds object existence mask + obj_mask = paddle.cast(tobj > 0., dtype="float32") + obj_mask.stop_gradient = True + # For positive objectness grids, objectness loss should be calculated + # For negative objectness grids, objectness loss is calculated only iou_mask == 1.0 + obj_sigmoid = F.sigmoid(obj) + loss_obj = F.binary_cross_entropy(obj_sigmoid, obj_mask, reduction='none') + loss_obj_pos = paddle.sum(loss_obj * tobj, axis=[1, 2, 3]) + loss_obj_neg = paddle.sum(loss_obj * (1.0 - obj_mask) * iou_mask, + axis=[1, 2, 3]) + return loss_obj_pos, loss_obj_neg + + +def fine_grained_loss(output, + target, + gt_box, + batch_size, + num_classes, + anchors, + ignore_thresh, + downsample, + scale_x_y=1., + eps=1e-10): + an_num = len(anchors) // 2 + x, y, w, h, obj, cls = _split_output(output, an_num, num_classes) + tx, ty, tw, th, tscale, tobj, tcls = _split_target(target) + + tscale_tobj = tscale * tobj + + scale_x_y = scale_x_y + + if (abs(scale_x_y - 1.0) < eps): + x = F.sigmoid(x) + y = F.sigmoid(y) + loss_x = F.binary_cross_entropy(x, tx, reduction='none') * tscale_tobj + loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) + loss_y = F.binary_cross_entropy(y, ty, reduction='none') * tscale_tobj + loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) + else: + dx = scale_x_y * F.sigmoid(x) - 0.5 * (scale_x_y - 1.0) + dy = scale_x_y * F.sigmoid(y) - 0.5 * (scale_x_y - 1.0) + loss_x = paddle.abs(dx - tx) * tscale_tobj + loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) + loss_y = paddle.abs(dy - ty) * tscale_tobj + loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) + + # NOTE: we refined loss function of (w, h) as L1Loss + loss_w = paddle.abs(w - tw) * tscale_tobj + loss_w = paddle.sum(loss_w, axis=[1, 2, 3]) + loss_h = paddle.abs(h - th) * tscale_tobj + loss_h = paddle.sum(loss_h, axis=[1, 2, 3]) + + loss_obj_pos, loss_obj_neg = _calc_obj_loss( + output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample, + ignore_thresh, scale_x_y) + + cls = F.sigmoid(cls) + loss_cls = F.binary_cross_entropy(cls, tcls, reduction='none') + tobj = paddle.unsqueeze(tobj, axis=-1) + + loss_cls = paddle.multiply(loss_cls, tobj) + loss_cls = paddle.sum(loss_cls, axis=[1, 2, 3, 4]) + + loss_xys = paddle.mean(loss_x + loss_y) + loss_whs = paddle.mean(loss_w + loss_h) + loss_objs = paddle.mean(loss_obj_pos + loss_obj_neg) + loss_clss = paddle.mean(loss_cls) + + losses_all = { + "loss_xy": paddle.sum(loss_xys), + "loss_wh": paddle.sum(loss_whs), + "loss_loc": paddle.sum(loss_xys) + paddle.sum(loss_whs), + "loss_obj": paddle.sum(loss_objs), + "loss_cls": paddle.sum(loss_clss), + } + return losses_all, x, y, tx, ty + + +def gt2yolotarget(gt_bbox, gt_class, gt_score, anchors, mask, num_classes, size, + stride): + grid_h, grid_w = size + h, w = grid_h * stride, grid_w * stride + an_hw = np.array(anchors) / np.array([[w, h]]) + target = np.zeros( + (len(mask), 6 + num_classes, grid_h, grid_w), dtype=np.float32) + for b in range(gt_bbox.shape[0]): + gx, gy, gw, gh = gt_bbox[b, :] + cls = gt_class[b] + score = gt_score[b] + if gw <= 0. or gh <= 0. 
or score <= 0.: + continue + + # find best match anchor index + best_iou = 0. + best_idx = -1 + for an_idx in range(an_hw.shape[0]): + iou = jaccard_overlap([0., 0., gw, gh], + [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) + if iou > best_iou: + best_iou = iou + best_idx = an_idx + + gi = int(gx * grid_w) + gj = int(gy * grid_h) + + # gtbox should be regresed in this layes if best match + # anchor index in anchor mask of this layer + if best_idx in mask: + best_n = mask.index(best_idx) + + # x, y, w, h, scale + target[best_n, 0, gj, gi] = gx * grid_w - gi + target[best_n, 1, gj, gi] = gy * grid_h - gj + target[best_n, 2, gj, gi] = np.log(gw * w / anchors[best_idx][0]) + target[best_n, 3, gj, gi] = np.log(gh * h / anchors[best_idx][1]) + target[best_n, 4, gj, gi] = 2.0 - gw * gh + + # objectness record gt_score + # if target[best_n, 5, gj, gi] > 0: + # print('find 1 duplicate') + target[best_n, 5, gj, gi] = score + + # classification + target[best_n, 6 + cls, gj, gi] = 1. + + return target + + +class TestYolov3LossOp(unittest.TestCase): + def setUp(self): + self.initTestCase() + x = np.random.uniform(0, 1, self.x_shape).astype('float64') + gtbox = np.random.random(size=self.gtbox_shape).astype('float64') + gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2]) + gtmask = np.random.randint(0, 2, self.gtbox_shape[:2]) + gtbox = gtbox * gtmask[:, :, np.newaxis] + gtlabel = gtlabel * gtmask + + gtscore = np.ones(self.gtbox_shape[:2]).astype('float64') + if self.gtscore: + gtscore = np.random.random(self.gtbox_shape[:2]).astype('float64') + + target = [] + for box, label, score in zip(gtbox, gtlabel, gtscore): + target.append( + gt2yolotarget(box, label, score, self.anchors, self.anchor_mask, + self.class_num, (self.h, self.w + ), self.downsample_ratio)) + + self.target = np.array(target).astype('float64') + + self.mask_anchors = [] + for i in self.anchor_mask: + self.mask_anchors.extend(self.anchors[i]) + self.x = x + self.gtbox = gtbox + self.gtlabel = gtlabel + self.gtscore = gtscore + + def initTestCase(self): + self.b = 8 + self.h = 19 + self.w = 19 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [6, 7, 8] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = True + self.use_label_smooth = False + self.scale_x_y = 1. 
+ + def test_loss(self): + x, gtbox, gtlabel, gtscore, target = self.x, self.gtbox, self.gtlabel, self.gtscore, self.target + yolo_loss = YOLOv3Loss( + ignore_thresh=self.ignore_thresh, + label_smooth=self.use_label_smooth, + num_classes=self.class_num, + downsample=self.downsample_ratio, + scale_x_y=self.scale_x_y) + x = paddle.to_tensor(x.astype(np.float32)) + gtbox = paddle.to_tensor(gtbox.astype(np.float32)) + gtlabel = paddle.to_tensor(gtlabel.astype(np.float32)) + gtscore = paddle.to_tensor(gtscore.astype(np.float32)) + t = paddle.to_tensor(target.astype(np.float32)) + anchor = [self.anchors[i] for i in self.anchor_mask] + (yolo_loss1, px, py, tx, ty) = fine_grained_loss( + output=x, + target=t, + gt_box=gtbox, + batch_size=self.b, + num_classes=self.class_num, + anchors=self.mask_anchors, + ignore_thresh=self.ignore_thresh, + downsample=self.downsample_ratio, + scale_x_y=self.scale_x_y) + yolo_loss2 = yolo_loss.yolov3_loss( + x, t, gtbox, anchor, self.downsample_ratio, self.scale_x_y) + for k in yolo_loss2: + self.assertAlmostEqual( + yolo_loss1[k].numpy()[0], + yolo_loss2[k].numpy()[0], + delta=1e-2, + msg=k) + + +class TestYolov3LossNoGTScore(TestYolov3LossOp): + def initTestCase(self): + self.b = 1 + self.h = 76 + self.w = 76 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [0, 1, 2] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 8 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = False + self.use_label_smooth = False + self.scale_x_y = 1. + + +class TestYolov3LossWithScaleXY(TestYolov3LossOp): + def initTestCase(self): + self.b = 5 + self.h = 38 + self.w = 38 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [3, 4, 5] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 16 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = True + self.use_label_smooth = False + self.scale_x_y = 1.2 + + +if __name__ == "__main__": + unittest.main() diff --git a/paddlers/models/ppdet/modeling/transformers/deformable_transformer.py b/paddlers/models/ppdet/modeling/transformers/deformable_transformer.py index 76ff1bb..5c79004 100644 --- a/paddlers/models/ppdet/modeling/transformers/deformable_transformer.py +++ b/paddlers/models/ppdet/modeling/transformers/deformable_transformer.py @@ -13,7 +13,7 @@ # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2022 SenseTime. All Rights Reserved. +# Copyright (c) 2020 SenseTime. All Rights Reserved. from __future__ import absolute_import from __future__ import division diff --git a/paddlers/models/ppdet/optimizer/__init__.py b/paddlers/models/ppdet/optimizer/__init__.py new file mode 100644 index 0000000..6173792 --- /dev/null +++ b/paddlers/models/ppdet/optimizer/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import * +from .ema import ModelEMA diff --git a/paddlers/models/ppdet/optimizer/adamw.py b/paddlers/models/ppdet/optimizer/adamw.py new file mode 100644 index 0000000..821135d --- /dev/null +++ b/paddlers/models/ppdet/optimizer/adamw.py @@ -0,0 +1,244 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle.optimizer import AdamW +from functools import partial +import re + + +def layerwise_lr_decay(decay_rate, name_dict, n_layers, param): + """ + Args: + decay_rate (float): + The layer-wise decay ratio. + name_dict (dict): + The keys of name_dict is dynamic name of model while the value + of name_dict is static name. + Use model.named_parameters() to get name_dict. + n_layers (int): + Total number of layers in the transformer encoder. + """ + ratio = 1.0 + static_name = name_dict[param.name] + if 'blocks.' in static_name or 'layers.' in static_name: + idx_1 = static_name.find('blocks.') + idx_2 = static_name.find('layers.') + assert any([x >= 0 for x in [idx_1, idx_2]]), '' + idx = idx_1 if idx_1 >= 0 else idx_2 + # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0] + + layer = int(static_name[idx:].split('.')[1]) + ratio = decay_rate**(n_layers - layer) + + elif 'cls_token' in static_name or 'patch_embed' in static_name: + ratio = decay_rate**(n_layers + 1) + + param.optimize_attr['learning_rate'] *= ratio + + +class AdamWDL(AdamW): + r""" + The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. + Generally it's used for transformer model. + + We use "layerwise_lr_decay" as default dynamic lr setting method of AdamWDL. + “Layer-wise decay” means exponentially decaying the learning rates of individual + layers in a top-down manner. For example, suppose the 24-th layer uses a learning + rate l, and the Layer-wise decay rate is α, then the learning rate of layer m + is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237. + + .. math:: + & t = t + 1 + + & moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad + + & moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad + + & learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} + + & param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + + Args: + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. 
It can be a float value or a LRScheduler. The default value is 0.001.
+        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.9.
+        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 1e-08.
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
+        apply_decay_param_fun (function|None, optional): If it is not None,
+            only tensors that make apply_decay_param_fun(Tensor.name)==True
+            will be updated. It only works when we want to specify tensors.
+            Default: None.
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
+            The accumulators are updated at every step. Every element of the two moving-average
+            is updated in both dense mode and sparse mode. If the size of a parameter is very large,
+            then the update may be very slow. The lazy mode only updates the elements that have
+            gradients in the current mini-batch, so it can be much faster. But this mode has
+            different semantics from the original Adam algorithm and may lead to different results.
+            The default value is False.
+        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
+        layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0.
+        n_layers (int, optional): The total number of encoder layers. Defaults to 12.
+        set_param_lr_func (function|None, optional): If it's not None, set_param_lr_func() will set the parameter
+            learning rate before it executes the Adam operator. Defaults to :ref:`layerwise_lr_decay`.
+        name_dict (dict, optional): The keys of name_dict are the dynamic names in the model while the values
+            are the static names. Use model.named_parameters() to get name_dict.
+        name (str, optional): Normally there is no need for the user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddlenlp.ops.optimizer import AdamWDL
+            def simple_lr_setting(decay_rate, name_dict, n_layers, param):
+                ratio = 1.0
+                static_name = name_dict[param.name]
+                if "weight" in static_name:
+                    ratio = decay_rate**0.5
+                param.optimize_attr["learning_rate"] *= ratio
+
+            linear = paddle.nn.Linear(10, 10)
+
+            name_dict = dict()
+            for n, p in linear.named_parameters():
+                name_dict[p.name] = n
+
+            inp = paddle.rand([10,10], dtype="float32")
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            adamwdl = AdamWDL(
+                learning_rate=1e-4,
+                parameters=linear.parameters(),
+                set_param_lr_fun=simple_lr_setting,
+                layerwise_decay=0.8,
+                name_dict=name_dict)
+
+            loss.backward()
+            adamwdl.step()
+            adamwdl.clear_grad()
+    """
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 parameters=None,
+                 weight_decay=0.01,
+                 apply_decay_param_fun=None,
+                 grad_clip=None,
+                 lazy_mode=False,
+                 multi_precision=False,
+                 layerwise_decay=1.0,
+                 n_layers=12,
+                 set_param_lr_func=None,
+                 name_dict=None,
+                 name=None):
+        if not isinstance(layerwise_decay, float):
+            raise TypeError("layerwise_decay should be a float.")
+        self.layerwise_decay = layerwise_decay
+        self.n_layers = n_layers
+        self.set_param_lr_func = partial(
+            set_param_lr_func, layerwise_decay, name_dict,
+            n_layers) if set_param_lr_func is not None else set_param_lr_func
+        super(AdamWDL, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            beta1=beta1,
+            beta2=beta2,
+            epsilon=epsilon,
+            grad_clip=grad_clip,
+            name=name,
+            apply_decay_param_fun=apply_decay_param_fun,
+            weight_decay=weight_decay,
+            lazy_mode=lazy_mode,
+            multi_precision=multi_precision)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if self.set_param_lr_func is None:
+            return super(AdamWDL, self)._append_optimize_op(block,
+                                                            param_and_grad)
+
+        self._append_decoupled_weight_decay(block, param_and_grad)
+        prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
+        self.set_param_lr_func(param_and_grad[0])
+        # execute the Adam op
+        res = super(AdamW, self)._append_optimize_op(block, param_and_grad)
+        param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
+        return res
+
+
+def build_adamwdl(model,
+                  lr=1e-4,
+                  weight_decay=0.05,
+                  betas=(0.9, 0.999),
+                  layer_decay=0.65,
+                  num_layers=None,
+                  filter_bias_and_bn=True,
+                  skip_decay_names=None,
+                  set_param_lr_func='layerwise_lr_decay'):
+
+    decay_dict = None  # avoid a NameError when no skip list is given
+    if skip_decay_names and filter_bias_and_bn:
+        decay_dict = {
+            param.name: not (len(param.shape) == 1 or name.endswith('.bias') or
+                             any([_n in name for _n in skip_decay_names]))
+            for name, param in model.named_parameters()
+        }
+        parameters = [p for p in model.parameters()]
+
+    else:
+        parameters = model.parameters()
+
+    opt_args = dict(
+        parameters=parameters, learning_rate=lr, weight_decay=weight_decay)
+
+    if decay_dict is not None:
+        opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n]
+
+    if isinstance(set_param_lr_func, str):
+        set_param_lr_func = eval(set_param_lr_func)
+    opt_args['set_param_lr_func'] = set_param_lr_func
+
+    opt_args['beta1'] = betas[0]
+    opt_args['beta2'] = betas[1]
+
+    opt_args['layerwise_decay'] = layer_decay
+    name_dict = {p.name: n for n, p in model.named_parameters()}
+
+    opt_args['name_dict'] = name_dict
+    opt_args['n_layers'] = num_layers
+
+    optimizer = AdamWDL(**opt_args)
+
+    return optimizer
diff --git a/paddlers/models/ppdet/optimizer/ema.py b/paddlers/models/ppdet/optimizer/ema.py
new file mode 100644
index 0000000..927d357
--- /dev/null
+++ b/paddlers/models/ppdet/optimizer/ema.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import weakref
+
+
+class ModelEMA(object):
+    """
+    Exponentially Weighted Average for Deep Neural Networks
+    Args:
+        model (nn.Layer): The detector model.
+        decay (float): The decay used for updating the ema parameters.
+            EMA parameters are updated with the formula:
+            `ema_param = decay * ema_param + (1 - decay) * cur_param`.
+            Default is 0.9998.
+        ema_decay_type (str): type in ['threshold', 'normal', 'exponential'],
+            'threshold' as default.
+        cycle_epoch (int): The epoch interval to reset ema_param and
+            step. Default is -1, which means no reset. It adds a
+            regularizing effect to EMA, is set according to experience
+            and is effective when the total number of training epochs is large.
+        ema_black_list (set|list|tuple, optional): The custom EMA black_list.
+            Blacklist of weight names that will not participate in EMA
+            calculation. Default: None.
+    """
+
+    def __init__(self,
+                 model,
+                 decay=0.9998,
+                 ema_decay_type='threshold',
+                 cycle_epoch=-1,
+                 ema_black_list=None):
+        self.step = 0
+        self.epoch = 0
+        self.decay = decay
+        self.ema_decay_type = ema_decay_type
+        self.cycle_epoch = cycle_epoch
+        self.ema_black_list = self._match_ema_black_list(
+            model.state_dict().keys(), ema_black_list)
+        self.state_dict = dict()
+        for k, v in model.state_dict().items():
+            if k in self.ema_black_list:
+                self.state_dict[k] = v
+            else:
+                self.state_dict[k] = paddle.zeros_like(v)
+
+        self._model_state = {
+            k: weakref.ref(p)
+            for k, p in model.state_dict().items()
+        }
+
+    def reset(self):
+        self.step = 0
+        self.epoch = 0
+        for k, v in self.state_dict.items():
+            if k in self.ema_black_list:
+                self.state_dict[k] = v
+            else:
+                self.state_dict[k] = paddle.zeros_like(v)
+
+    def resume(self, state_dict, step=0):
+        for k, v in state_dict.items():
+            if k in self.state_dict:
+                if self.state_dict[k].dtype == v.dtype:
+                    self.state_dict[k] = v
+                else:
+                    self.state_dict[k] = v.astype(self.state_dict[k].dtype)
+        self.step = step
+
+    def update(self, model=None):
+        if self.ema_decay_type == 'threshold':
+            decay = min(self.decay, (1 + self.step) / (10 + self.step))
+        elif self.ema_decay_type == 'exponential':
+            decay = self.decay * (1 - math.exp(-(self.step + 1) / 2000))
+        else:
+            decay = self.decay
+        self._decay = decay
+
+        if model is not None:
+            model_dict = model.state_dict()
+        else:
+            model_dict = {k: p() for k, p in self._model_state.items()}
+            assert all(
+                [v is not None for _, v in model_dict.items()
+                 ]), 'model weights have been garbage collected; keep a reference to the model.'
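The two decay warm-ups in `update()` above behave very differently early in training. A minimal standalone sketch (step values chosen for illustration):

```python
import math

decay = 0.9998
for step in [0, 10, 100, 1000, 10000]:
    # 'threshold': ramps from 0.1 at step 0 toward `decay`
    thr = min(decay, (1 + step) / (10 + step))
    # 'exponential': ramps toward `decay` with a ~2000-step time constant
    exp = decay * (1 - math.exp(-(step + 1) / 2000))
    print(step, round(thr, 4), round(exp, 4))
# step 0: 0.1 vs 0.0005; step 10000: 0.9991 vs 0.9931
```

Both keep the zero-initialized EMA state from dominating early averages; for the non-exponential types, `apply()` additionally divides by `1 - decay**step` as a bias correction.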
+ + for k, v in self.state_dict.items(): + if k not in self.ema_black_list: + v = decay * v + (1 - decay) * model_dict[k] + v.stop_gradient = True + self.state_dict[k] = v + self.step += 1 + + def apply(self): + if self.step == 0: + return self.state_dict + state_dict = dict() + for k, v in self.state_dict.items(): + if k in self.ema_black_list: + v.stop_gradient = True + state_dict[k] = v + else: + if self.ema_decay_type != 'exponential': + v = v / (1 - self._decay**self.step) + v.stop_gradient = True + state_dict[k] = v + self.epoch += 1 + if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch: + self.reset() + + return state_dict + + def _match_ema_black_list(self, weight_name, ema_black_list=None): + out_list = set() + if ema_black_list: + for name in weight_name: + for key in ema_black_list: + if key in name: + out_list.add(name) + return out_list diff --git a/paddlers/models/ppdet/optimizer.py b/paddlers/models/ppdet/optimizer/optimizer.py similarity index 59% rename from paddlers/models/ppdet/optimizer.py rename to paddlers/models/ppdet/optimizer/optimizer.py index c13df2d..7566dd8 100644 --- a/paddlers/models/ppdet/optimizer.py +++ b/paddlers/models/ppdet/optimizer/optimizer.py @@ -16,6 +16,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import sys import math import paddle import paddle.nn as nn @@ -24,6 +25,9 @@ import paddle.optimizer as optimizer import paddle.regularizer as regularizer from paddlers.models.ppdet.core.workspace import register, serializable +import copy + +from .adamw import AdamWDL, build_adamwdl __all__ = ['LearningRate', 'OptimizerBuilder'] @@ -40,12 +44,21 @@ class CosineDecay(object): max_epochs (int): max epochs for the training process. if you commbine cosine decay with warmup, it is recommended that the max_iters is much larger than the warmup iter + use_warmup (bool): whether to use warmup. Default: True. + min_lr_ratio (float): minimum learning rate ratio. Default: 0. + last_plateau_epochs (int): use minimum learning rate in + the last few epochs. Default: 0. 
""" - def __init__(self, max_epochs=1000, use_warmup=True, eta_min=0): + def __init__(self, + max_epochs=1000, + use_warmup=True, + min_lr_ratio=0., + last_plateau_epochs=0): self.max_epochs = max_epochs self.use_warmup = use_warmup - self.eta_min = eta_min + self.min_lr_ratio = min_lr_ratio + self.last_plateau_epochs = last_plateau_epochs def __call__(self, base_lr=None, @@ -55,20 +68,38 @@ class CosineDecay(object): assert base_lr is not None, "either base LR or values should be provided" max_iters = self.max_epochs * int(step_per_epoch) - + last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch) + min_lr = base_lr * self.min_lr_ratio if boundary is not None and value is not None and self.use_warmup: + # use warmup warmup_iters = len(boundary) for i in range(int(boundary[-1]), max_iters): boundary.append(i) - - decayed_lr = base_lr * 0.5 * (math.cos( - (i - warmup_iters) * math.pi / - (max_iters - warmup_iters)) + 1) - value.append(decayed_lr) + if i < max_iters - last_plateau_iters: + decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( + (i - warmup_iters) * math.pi / + (max_iters - warmup_iters - last_plateau_iters)) + 1) + value.append(decayed_lr) + else: + value.append(min_lr) + return optimizer.lr.PiecewiseDecay(boundary, value) + elif last_plateau_iters > 0: + # not use warmup, but set `last_plateau_epochs` > 0 + boundary = [] + value = [] + for i in range(max_iters): + if i < max_iters - last_plateau_iters: + decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( + i * math.pi / (max_iters - last_plateau_iters)) + 1) + value.append(decayed_lr) + else: + value.append(min_lr) + if i > 0: + boundary.append(i) return optimizer.lr.PiecewiseDecay(boundary, value) return optimizer.lr.CosineAnnealingDecay( - base_lr, T_max=max_iters, eta_min=self.eta_min) + base_lr, T_max=max_iters, eta_min=min_lr) @serializable @@ -130,19 +161,25 @@ class LinearWarmup(object): Args: steps (int): warm up steps start_factor (float): initial learning rate factor + epochs (int|None): use epochs as warm up steps, the priority + of `epochs` is higher than `steps`. Default: None. """ - def __init__(self, steps=500, start_factor=1. / 3): + def __init__(self, steps=500, start_factor=1. / 3, epochs=None): super(LinearWarmup, self).__init__() self.steps = steps self.start_factor = start_factor + self.epochs = epochs def __call__(self, base_lr, step_per_epoch): boundary = [] value = [] - for i in range(self.steps + 1): - if self.steps > 0: - alpha = i / self.steps + warmup_steps = self.epochs * step_per_epoch \ + if self.epochs is not None else self.steps + warmup_steps = max(warmup_steps, 1) + for i in range(warmup_steps + 1): + if warmup_steps > 0: + alpha = i / warmup_steps factor = self.start_factor * (1 - alpha) + alpha lr = base_lr * factor value.append(lr) @@ -152,25 +189,30 @@ class LinearWarmup(object): @serializable -class BurninWarmup(object): +class ExpWarmup(object): """ - Warm up learning rate in burnin mode + Warm up learning rate in exponential mode Args: - steps (int): warm up steps + steps (int): warm up steps. + epochs (int|None): use epochs as warm up steps, the priority + of `epochs` is higher than `steps`. Default: None. + power (int): Exponential coefficient. Default: 2. 
""" - def __init__(self, steps=1000): - super(BurninWarmup, self).__init__() + def __init__(self, steps=1000, epochs=None, power=2): + super(ExpWarmup, self).__init__() self.steps = steps + self.epochs = epochs + self.power = power def __call__(self, base_lr, step_per_epoch): boundary = [] value = [] - burnin = min(self.steps, step_per_epoch) - for i in range(burnin + 1): - factor = (i * 1.0 / burnin)**4 - lr = base_lr * factor - value.append(lr) + warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps + warmup_steps = max(warmup_steps, 1) + for i in range(warmup_steps + 1): + factor = (i / float(warmup_steps))**self.power + value.append(base_lr * factor) if i > 0: boundary.append(i) return boundary, value @@ -192,7 +234,18 @@ class LearningRate(object): schedulers=[PiecewiseDecay(), LinearWarmup()]): super(LearningRate, self).__init__() self.base_lr = base_lr - self.schedulers = schedulers + self.schedulers = [] + + schedulers = copy.deepcopy(schedulers) + for sched in schedulers: + if isinstance(sched, dict): + # support dict sched instantiate + module = sys.modules[__name__] + type = sched.pop("name") + scheduler = getattr(module, type)(**sched) + self.schedulers.append(scheduler) + else: + self.schedulers.append(sched) def __call__(self, step_per_epoch): assert len(self.schedulers) >= 1 @@ -245,93 +298,53 @@ class OptimizerBuilder(): optim_args = self.optimizer.copy() optim_type = optim_args['type'] del optim_args['type'] + + if optim_type == 'AdamWDL': + return build_adamwdl(model, lr=learning_rate, **optim_args) + if optim_type != 'AdamW': optim_args['weight_decay'] = regularization + op = getattr(optimizer, optim_type) - if 'without_weight_decay_params' in optim_args: - keys = optim_args['without_weight_decay_params'] - params = [{ - 'params': [ - p for n, p in model.named_parameters() - if any([k in n for k in keys]) - ], - 'weight_decay': 0. - }, { - 'params': [ - p for n, p in model.named_parameters() - if all([k not in n for k in keys]) - ] - }] - del optim_args['without_weight_decay_params'] + if 'param_groups' in optim_args: + assert isinstance(optim_args['param_groups'], list), '' + + param_groups = optim_args.pop('param_groups') + + params, visited = [], [] + for group in param_groups: + assert isinstance(group, + dict) and 'params' in group and isinstance( + group['params'], list), '' + _params = { + n: p + for n, p in model.named_parameters() + if any([k in n + for k in group['params']]) and p.trainable is True + } + _group = group.copy() + _group.update({'params': list(_params.values())}) + + params.append(_group) + visited.extend(list(_params.keys())) + + ext_params = [ + p for n, p in model.named_parameters() + if n not in visited and p.trainable is True + ] + + if len(ext_params) < len(model.parameters()): + params.append({'params': ext_params}) + + elif len(ext_params) > len(model.parameters()): + raise RuntimeError + else: - params = model.parameters() + _params = model.parameters() + params = [param for param in _params if param.trainable is True] return op(learning_rate=learning_rate, parameters=params, grad_clip=grad_clip, **optim_args) - - -class ModelEMA(object): - """ - Exponential Weighted Average for Deep Neutal Networks - Args: - model (nn.Layer): Detector of model. - decay (int): The decay used for updating ema parameter. - Ema's parameter are updated with the formula: - `ema_param = decay * ema_param + (1 - decay) * cur_param`. - Defaults is 0.9998. 
- use_thres_step (bool): Whether set decay by thres_step or not - cycle_epoch (int): The epoch of interval to reset ema_param and - step. Defaults is -1, which means not reset. Its function is to - add a regular effect to ema, which is set according to experience - and is effective when the total training epoch is large. - """ - - def __init__(self, - model, - decay=0.9998, - use_thres_step=False, - cycle_epoch=-1): - self.step = 0 - self.epoch = 0 - self.decay = decay - self.state_dict = dict() - for k, v in model.state_dict().items(): - self.state_dict[k] = paddle.zeros_like(v) - self.use_thres_step = use_thres_step - self.cycle_epoch = cycle_epoch - - def reset(self): - self.step = 0 - self.epoch = 0 - for k, v in self.state_dict.items(): - self.state_dict[k] = paddle.zeros_like(v) - - def update(self, model): - if self.use_thres_step: - decay = min(self.decay, (1 + self.step) / (10 + self.step)) - else: - decay = self.decay - self._decay = decay - model_dict = model.state_dict() - for k, v in self.state_dict.items(): - v = decay * v + (1 - decay) * model_dict[k] - v.stop_gradient = True - self.state_dict[k] = v - self.step += 1 - - def apply(self): - if self.step == 0: - return self.state_dict - state_dict = dict() - for k, v in self.state_dict.items(): - v = v / (1 - self._decay**self.step) - v.stop_gradient = True - state_dict[k] = v - self.epoch += 1 - if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch: - self.reset() - - return state_dict diff --git a/paddlers/models/ppdet/slim/__init__.py b/paddlers/models/ppdet/slim/__init__.py index 1cc1541..6a3f210 100644 --- a/paddlers/models/ppdet/slim/__init__.py +++ b/paddlers/models/ppdet/slim/__init__.py @@ -21,6 +21,7 @@ from .prune import * from .quant import * from .distill import * from .unstructured_prune import * +from .ofa import * import yaml from paddlers.models.ppdet.core.workspace import load_config @@ -34,8 +35,21 @@ def build_slim_model(cfg, slim_cfg, mode='train'): return cfg if slim_load_cfg['slim'] == 'Distill': - model = DistillModel(cfg, slim_cfg) + if "slim_method" in slim_load_cfg and slim_load_cfg[ + 'slim_method'] == "FGD": + model = FGDDistillModel(cfg, slim_cfg) + else: + model = DistillModel(cfg, slim_cfg) cfg['model'] = model + cfg['slim_type'] = cfg.slim + elif slim_load_cfg['slim'] == 'OFA': + load_config(slim_cfg) + model = create(cfg.architecture) + load_pretrain_weight(model, cfg.weights) + slim = create(cfg.slim) + cfg['slim'] = slim + cfg['model'] = slim(model, model.state_dict()) + cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'DistillPrune': if mode == 'train': model = DistillModel(cfg, slim_cfg) @@ -55,9 +69,9 @@ def build_slim_model(cfg, slim_cfg, mode='train'): load_config(slim_cfg) load_pretrain_weight(model, cfg.weights) slim = create(cfg.slim) - cfg['slim_type'] = cfg.slim - cfg['model'] = slim(model) cfg['slim'] = slim + cfg['model'] = slim(model) + cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'UnstructuredPruner': load_config(slim_cfg) slim = create(cfg.slim) @@ -72,7 +86,7 @@ def build_slim_model(cfg, slim_cfg, mode='train'): slim = create(cfg.slim) cfg['slim_type'] = cfg.slim # TODO: fix quant export model in framework. 
- if mode == 'test' and slim_load_cfg['slim'] == 'QAT': + if mode == 'test' and 'QAT' in slim_load_cfg['slim']: slim.quant_config['activation_preprocess_type'] = None cfg['model'] = slim(model) cfg['slim'] = slim diff --git a/paddlers/models/ppdet/slim/distill.py b/paddlers/models/ppdet/slim/distill.py index 2562363..9fb29b1 100644 --- a/paddlers/models/ppdet/slim/distill.py +++ b/paddlers/models/ppdet/slim/distill.py @@ -19,6 +19,7 @@ from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F +from paddle import ParamAttr from paddlers.models.ppdet.core.workspace import register, create, load_config from paddlers.models.ppdet.modeling import ops @@ -63,6 +64,111 @@ class DistillModel(nn.Layer): return self.student_model(inputs) +class FGDDistillModel(nn.Layer): + """ + Build FGD distill model. + Args: + cfg: The student config. + slim_cfg: The teacher and distill config. + """ + + def __init__(self, cfg, slim_cfg): + super(FGDDistillModel, self).__init__() + + self.is_inherit = True + # build student model before load slim config + self.student_model = create(cfg.architecture) + self.arch = cfg.architecture + stu_pretrain = cfg['pretrain_weights'] + slim_cfg = load_config(slim_cfg) + self.teacher_cfg = slim_cfg + self.loss_cfg = slim_cfg + tea_pretrain = cfg['pretrain_weights'] + + self.teacher_model = create(self.teacher_cfg.architecture) + self.teacher_model.eval() + + for param in self.teacher_model.parameters(): + param.trainable = False + + if 'pretrain_weights' in cfg and stu_pretrain: + if self.is_inherit and 'pretrain_weights' in self.teacher_cfg and self.teacher_cfg.pretrain_weights: + load_pretrain_weight(self.student_model, + self.teacher_cfg.pretrain_weights) + logger.debug( + "Inheriting! 
loading teacher weights to student model!") + + load_pretrain_weight(self.student_model, stu_pretrain) + + if 'pretrain_weights' in self.teacher_cfg and self.teacher_cfg.pretrain_weights: + load_pretrain_weight(self.teacher_model, + self.teacher_cfg.pretrain_weights) + + self.fgd_loss_dic = self.build_loss( + self.loss_cfg.distill_loss, + name_list=self.loss_cfg['distill_loss_name']) + + def build_loss(self, + cfg, + name_list=[ + 'neck_f_4', 'neck_f_3', 'neck_f_2', 'neck_f_1', + 'neck_f_0' + ]): + loss_func = dict() + for idx, k in enumerate(name_list): + loss_func[k] = create(cfg) + return loss_func + + def forward(self, inputs): + if self.training: + s_body_feats = self.student_model.backbone(inputs) + s_neck_feats = self.student_model.neck(s_body_feats) + + with paddle.no_grad(): + t_body_feats = self.teacher_model.backbone(inputs) + t_neck_feats = self.teacher_model.neck(t_body_feats) + + loss_dict = {} + for idx, k in enumerate(self.fgd_loss_dic): + loss_dict[k] = self.fgd_loss_dic[k](s_neck_feats[idx], + t_neck_feats[idx], inputs) + if self.arch == "RetinaNet": + loss = self.student_model.head(s_neck_feats, inputs) + elif self.arch == "PicoDet": + head_outs = self.student_model.head( + s_neck_feats, self.student_model.export_post_process) + loss_gfl = self.student_model.head.get_loss(head_outs, inputs) + total_loss = paddle.add_n(list(loss_gfl.values())) + loss = {} + loss.update(loss_gfl) + loss.update({'loss': total_loss}) + else: + raise ValueError(f"Unsupported model {self.arch}") + for k in loss_dict: + loss['loss'] += loss_dict[k] + loss[k] = loss_dict[k] + return loss + else: + body_feats = self.student_model.backbone(inputs) + neck_feats = self.student_model.neck(body_feats) + head_outs = self.student_model.head(neck_feats) + if self.arch == "RetinaNet": + bbox, bbox_num = self.student_model.head.post_process( + head_outs, inputs['im_shape'], inputs['scale_factor']) + return {'bbox': bbox, 'bbox_num': bbox_num} + elif self.arch == "PicoDet": + head_outs = self.student_model.head( + neck_feats, self.student_model.export_post_process) + scale_factor = inputs['scale_factor'] + bboxes, bbox_num = self.student_model.head.post_process( + head_outs, + scale_factor, + export_nms=self.student_model.export_nms) + return {'bbox': bboxes, 'bbox_num': bbox_num} + else: + raise ValueError(f"Unsupported model {self.arch}") + + @register class DistillYOLOv3Loss(nn.Layer): def __init__(self, weight=1000): @@ -107,3 +213,278 @@ class DistillYOLOv3Loss(nn.Layer): loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss ) * self.weight return loss + + +def parameter_init(mode="kaiming", value=0.): + if mode == "kaiming": + weight_attr = paddle.nn.initializer.KaimingUniform() + elif mode == "constant": + weight_attr = paddle.nn.initializer.Constant(value=value) + else: + weight_attr = paddle.nn.initializer.KaimingUniform() + + weight_init = ParamAttr(initializer=weight_attr) + return weight_init + + +@register +class FGDFeatureLoss(nn.Layer): + """ + The code is reference from https://github.com/yzd-v/FGD/blob/master/mmdet/distillation/losses/fgd.py + Paddle version of `Focal and Global Knowledge Distillation for Detectors` + + Args: + student_channels(int): The number of channels in the student's FPN feature map. Default to 256. + teacher_channels(int): The number of channels in the teacher's FPN feature map. Default to 256. + temp (float, optional): The temperature coefficient. Defaults to 0.5. + alpha_fgd (float, optional): The weight of fg_loss. 
Defaults to 0.001 + beta_fgd (float, optional): The weight of bg_loss. Defaults to 0.0005 + gamma_fgd (float, optional): The weight of mask_loss. Defaults to 0.001 + lambda_fgd (float, optional): The weight of relation_loss. Defaults to 0.000005 + """ + + def __init__(self, + student_channels=256, + teacher_channels=256, + temp=0.5, + alpha_fgd=0.001, + beta_fgd=0.0005, + gamma_fgd=0.001, + lambda_fgd=0.000005): + super(FGDFeatureLoss, self).__init__() + self.temp = temp + self.alpha_fgd = alpha_fgd + self.beta_fgd = beta_fgd + self.gamma_fgd = gamma_fgd + self.lambda_fgd = lambda_fgd + + kaiming_init = parameter_init("kaiming") + zeros_init = parameter_init("constant", 0.0) + + if student_channels != teacher_channels: + self.align = nn.Conv2D( + student_channels, + teacher_channels, + kernel_size=1, + stride=1, + padding=0, + weight_attr=kaiming_init) + student_channels = teacher_channels + else: + self.align = None + + self.conv_mask_s = nn.Conv2D( + student_channels, 1, kernel_size=1, weight_attr=kaiming_init) + self.conv_mask_t = nn.Conv2D( + teacher_channels, 1, kernel_size=1, weight_attr=kaiming_init) + + self.stu_conv_block = nn.Sequential( + nn.Conv2D( + student_channels, + student_channels // 2, + kernel_size=1, + weight_attr=zeros_init), + nn.LayerNorm([student_channels // 2, 1, 1]), + nn.ReLU(), + nn.Conv2D( + student_channels // 2, + student_channels, + kernel_size=1, + weight_attr=zeros_init)) + self.tea_conv_block = nn.Sequential( + nn.Conv2D( + teacher_channels, + teacher_channels // 2, + kernel_size=1, + weight_attr=zeros_init), + nn.LayerNorm([teacher_channels // 2, 1, 1]), + nn.ReLU(), + nn.Conv2D( + teacher_channels // 2, + teacher_channels, + kernel_size=1, + weight_attr=zeros_init)) + + def spatial_channel_attention(self, x, t=0.5): + shape = paddle.shape(x) + N, C, H, W = shape + + _f = paddle.abs(x) + spatial_map = paddle.reshape( + paddle.mean( + _f, axis=1, keepdim=True) / t, [N, -1]) + spatial_map = F.softmax(spatial_map, axis=1, dtype="float32") * H * W + spatial_att = paddle.reshape(spatial_map, [N, H, W]) + + channel_map = paddle.mean( + paddle.mean( + _f, axis=2, keepdim=False), axis=2, keepdim=False) + channel_att = F.softmax(channel_map / t, axis=1, dtype="float32") * C + return [spatial_att, channel_att] + + def spatial_pool(self, x, mode="teacher"): + batch, channel, width, height = x.shape + x_copy = x + x_copy = paddle.reshape(x_copy, [batch, channel, height * width]) + x_copy = x_copy.unsqueeze(1) + if mode.lower() == "student": + context_mask = self.conv_mask_s(x) + else: + context_mask = self.conv_mask_t(x) + + context_mask = paddle.reshape(context_mask, [batch, 1, height * width]) + context_mask = F.softmax(context_mask, axis=2) + context_mask = context_mask.unsqueeze(-1) + context = paddle.matmul(x_copy, context_mask) + context = paddle.reshape(context, [batch, channel, 1, 1]) + + return context + + def mask_loss(self, stu_channel_att, tea_channel_att, stu_spatial_att, + tea_spatial_att): + def _func(a, b): + return paddle.sum(paddle.abs(a - b)) / len(a) + + mask_loss = _func(stu_channel_att, tea_channel_att) + _func( + stu_spatial_att, tea_spatial_att) + + return mask_loss + + def feature_loss(self, stu_feature, tea_feature, Mask_fg, Mask_bg, + tea_channel_att, tea_spatial_att): + + Mask_fg = Mask_fg.unsqueeze(axis=1) + Mask_bg = Mask_bg.unsqueeze(axis=1) + + tea_channel_att = tea_channel_att.unsqueeze(axis=-1) + tea_channel_att = tea_channel_att.unsqueeze(axis=-1) + + tea_spatial_att = tea_spatial_att.unsqueeze(axis=1) + + fea_t = 
paddle.multiply(tea_feature, paddle.sqrt(tea_spatial_att))
+        fea_t = paddle.multiply(fea_t, paddle.sqrt(tea_channel_att))
+        fg_fea_t = paddle.multiply(fea_t, paddle.sqrt(Mask_fg))
+        bg_fea_t = paddle.multiply(fea_t, paddle.sqrt(Mask_bg))
+
+        fea_s = paddle.multiply(stu_feature, paddle.sqrt(tea_spatial_att))
+        fea_s = paddle.multiply(fea_s, paddle.sqrt(tea_channel_att))
+        fg_fea_s = paddle.multiply(fea_s, paddle.sqrt(Mask_fg))
+        bg_fea_s = paddle.multiply(fea_s, paddle.sqrt(Mask_bg))
+
+        fg_loss = F.mse_loss(fg_fea_s, fg_fea_t, reduction="sum") / len(Mask_fg)
+        bg_loss = F.mse_loss(bg_fea_s, bg_fea_t, reduction="sum") / len(Mask_bg)
+
+        return fg_loss, bg_loss
+
+    def relation_loss(self, stu_feature, tea_feature):
+        context_s = self.spatial_pool(stu_feature, "student")
+        context_t = self.spatial_pool(tea_feature, "teacher")
+
+        out_s = stu_feature + self.stu_conv_block(context_s)
+        out_t = tea_feature + self.tea_conv_block(context_t)
+
+        rela_loss = F.mse_loss(out_s, out_t, reduction="sum") / len(out_s)
+
+        return rela_loss
+
+    def mask_value(self, mask, xl, xr, yl, yr, value):
+        mask[xl:xr, yl:yr] = paddle.maximum(mask[xl:xr, yl:yr], value)
+        return mask
+
+    def forward(self, stu_feature, tea_feature, inputs):
+        """Forward function.
+        Args:
+            stu_feature(Tensor): Bs*C*H*W, student's feature map
+            tea_feature(Tensor): Bs*C*H*W, teacher's feature map
+            inputs: The inputs with gt bbox and input shape info.
+        """
+        assert stu_feature.shape[-2:] == tea_feature.shape[-2:], \
+            f'The shape of Student feature {stu_feature.shape} and Teacher feature {tea_feature.shape} should be the same.'
+        assert "gt_bbox" in inputs.keys() and "im_shape" in inputs.keys(
+        ), "ERROR! FGDFeatureLoss needs gt_bbox and im_shape as inputs."
+        gt_bboxes = inputs['gt_bbox']
+        ins_shape = [
+            inputs['im_shape'][i] for i in range(inputs['im_shape'].shape[0])
+        ]
+
+        index_gt = []
+        for i in range(len(gt_bboxes)):
+            if gt_bboxes[i].size > 2:
+                index_gt.append(i)
+        # only distill features with labeled gt boxes
+        if len(index_gt) != len(gt_bboxes):
+            index_gt_t = paddle.to_tensor(index_gt)
+            stu_feature = paddle.index_select(stu_feature, index_gt_t)
+            tea_feature = paddle.index_select(tea_feature, index_gt_t)
+
+            ins_shape = [ins_shape[c] for c in index_gt]
+            gt_bboxes = [gt_bboxes[c] for c in index_gt]
+            assert len(gt_bboxes) == tea_feature.shape[
+                0], f"The number of selected GT boxes [{len(gt_bboxes)}] should be the same as the first dim of the input tensor [{tea_feature.shape[0]}]."
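For intuition about the attention terms consumed by `feature_loss`, here is a shape-only sketch of `spatial_channel_attention` on random features (values illustrative; `temp` sharpens the softmax over the absolute activations):

```python
import paddle
import paddle.nn.functional as F

N, C, H, W = 2, 256, 32, 32
x, t = paddle.rand([N, C, H, W]), 0.5
a = paddle.abs(x)
spatial = F.softmax(paddle.mean(a, axis=1).reshape([N, -1]) / t,
                    axis=1).reshape([N, H, W]) * H * W  # sums to H*W per image
channel = F.softmax(paddle.mean(a, axis=[2, 3]) / t, axis=1) * C  # sums to C
print(spatial.shape, channel.shape)  # [2, 32, 32] [2, 256]
```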
+ + if self.align is not None: + stu_feature = self.align(stu_feature) + + N, C, H, W = stu_feature.shape + + tea_spatial_att, tea_channel_att = self.spatial_channel_attention( + tea_feature, self.temp) + stu_spatial_att, stu_channel_att = self.spatial_channel_attention( + stu_feature, self.temp) + + Mask_fg = paddle.zeros(tea_spatial_att.shape) + Mask_bg = paddle.ones_like(tea_spatial_att) + one_tmp = paddle.ones([*tea_spatial_att.shape[1:]]) + zero_tmp = paddle.zeros([*tea_spatial_att.shape[1:]]) + Mask_fg.stop_gradient = True + Mask_bg.stop_gradient = True + one_tmp.stop_gradient = True + zero_tmp.stop_gradient = True + + wmin, wmax, hmin, hmax, area = [], [], [], [], [] + + for i in range(N): + tmp_box = paddle.ones_like(gt_bboxes[i]) + tmp_box.stop_gradient = True + tmp_box[:, 0] = gt_bboxes[i][:, 0] / ins_shape[i][1] * W + tmp_box[:, 2] = gt_bboxes[i][:, 2] / ins_shape[i][1] * W + tmp_box[:, 1] = gt_bboxes[i][:, 1] / ins_shape[i][0] * H + tmp_box[:, 3] = gt_bboxes[i][:, 3] / ins_shape[i][0] * H + + zero = paddle.zeros_like(tmp_box[:, 0], dtype="int32") + ones = paddle.ones_like(tmp_box[:, 2], dtype="int32") + zero.stop_gradient = True + ones.stop_gradient = True + + wmin.append( + paddle.cast(paddle.floor(tmp_box[:, 0]), "int32").maximum(zero)) + wmax.append(paddle.cast(paddle.ceil(tmp_box[:, 2]), "int32")) + hmin.append( + paddle.cast(paddle.floor(tmp_box[:, 1]), "int32").maximum(zero)) + hmax.append(paddle.cast(paddle.ceil(tmp_box[:, 3]), "int32")) + + area_recip = 1.0 / ( + hmax[i].reshape([1, -1]) + 1 - hmin[i].reshape([1, -1])) / ( + wmax[i].reshape([1, -1]) + 1 - wmin[i].reshape([1, -1])) + + for j in range(len(gt_bboxes[i])): + Mask_fg[i] = self.mask_value(Mask_fg[i], hmin[i][j], + hmax[i][j] + 1, wmin[i][j], + wmax[i][j] + 1, area_recip[0][j]) + + Mask_bg[i] = paddle.where(Mask_fg[i] > zero_tmp, zero_tmp, one_tmp) + + if paddle.sum(Mask_bg[i]): + Mask_bg[i] /= paddle.sum(Mask_bg[i]) + + fg_loss, bg_loss = self.feature_loss(stu_feature, tea_feature, Mask_fg, + Mask_bg, tea_channel_att, + tea_spatial_att) + mask_loss = self.mask_loss(stu_channel_att, tea_channel_att, + stu_spatial_att, tea_spatial_att) + rela_loss = self.relation_loss(stu_feature, tea_feature) + + loss = self.alpha_fgd * fg_loss + self.beta_fgd * bg_loss \ + + self.gamma_fgd * mask_loss + self.lambda_fgd * rela_loss + + return loss diff --git a/paddlers/models/ppdet/slim/ofa.py b/paddlers/models/ppdet/slim/ofa.py new file mode 100644 index 0000000..8e6f942 --- /dev/null +++ b/paddlers/models/ppdet/slim/ofa.py @@ -0,0 +1,89 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppdet.core.workspace import load_config, merge_config, create +from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight +from paddlers.models.ppdet.utils.logger import setup_logger +from paddlers.models.ppdet.core.workspace import register, serializable + +from paddle.utils import try_import + +logger = setup_logger(__name__) + + +@register +@serializable +class OFA(object): + def __init__(self, ofa_config): + super(OFA, self).__init__() + self.ofa_config = ofa_config + + def __call__(self, model, param_state_dict): + + paddleslim = try_import('paddleslim') + from paddleslim.nas.ofa import OFA, RunConfig, utils + from paddleslim.nas.ofa.convert_super import Convert, supernet + task = self.ofa_config['task'] + expand_ratio = self.ofa_config['expand_ratio'] 
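Judging from the keys read in `__call__`, an `ofa_config` would look roughly like the following; this is an illustrative sketch, not a tested configuration:

```python
# Hypothetical OFA slim config (illustrative values):
ofa = OFA(ofa_config={
    'task': 'expand_ratio',
    'expand_ratio': [0.25, 0.5, 0.75, 1.0],
    'skip_neck': True,               # keep FPN layers out of the supernet
    'skip_head': True,               # keep the detection head out as well
    'RunConfig': {
        'skip_layers': ['conv1'],    # extra layers to pin (made-up name)
    },
})
# ofa_model = ofa(model, model.state_dict())
```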
+ + skip_neck = self.ofa_config['skip_neck'] + skip_head = self.ofa_config['skip_head'] + + run_config = self.ofa_config['RunConfig'] + if 'skip_layers' in run_config: + skip_layers = run_config['skip_layers'] + else: + skip_layers = [] + + # supernet config + sp_config = supernet(expand_ratio=expand_ratio) + # convert to supernet + model = Convert(sp_config).convert(model) + + skip_names = [] + if skip_neck: + skip_names.append('neck.') + if skip_head: + skip_names.append('head.') + + for name, sublayer in model.named_sublayers(): + for n in skip_names: + if n in name: + skip_layers.append(name) + + run_config['skip_layers'] = skip_layers + run_config = RunConfig(**run_config) + + # build ofa model + ofa_model = OFA(model, run_config=run_config) + + ofa_model.set_epoch(0) + ofa_model.set_task(task) + + input_spec = [{ + "image": paddle.ones( + shape=[1, 3, 640, 640], dtype='float32'), + "im_shape": paddle.full( + [1, 2], 640, dtype='float32'), + "scale_factor": paddle.ones( + shape=[1, 2], dtype='float32') + }] + + ofa_model._clear_search_space(input_spec=input_spec) + ofa_model._build_ss = True + check_ss = ofa_model._sample_config('expand_ratio', phase=None) + # tokenize the search space + ofa_model.tokenize() + # check token map, search cands and search space + logger.info('Token map is {}'.format(ofa_model.token_map)) + logger.info('Search candidates is {}'.format(ofa_model.search_cands)) + logger.info('The length of search_space is {}, search_space is {}'. + format(len(ofa_model._ofa_layers), ofa_model._ofa_layers)) + # set model state dict into ofa model + utils.set_state_dict(ofa_model.model, param_state_dict) + return ofa_model diff --git a/paddlers/models/ppdet/slim/prune.py b/paddlers/models/ppdet/slim/prune.py index 5c7928e..4ba287f 100644 --- a/paddlers/models/ppdet/slim/prune.py +++ b/paddlers/models/ppdet/slim/prune.py @@ -83,3 +83,69 @@ class Pruner(object): pruned_flops, (ori_flops - pruned_flops) / ori_flops)) return model + + +@register +@serializable +class PrunerQAT(object): + def __init__(self, criterion, pruned_params, pruned_ratios, + print_prune_params, quant_config, print_qat_model): + super(PrunerQAT, self).__init__() + assert criterion in ['l1_norm', 'fpgm'], \ + "unsupported prune criterion: {}".format(criterion) + # Pruner hyperparameter + self.criterion = criterion + self.pruned_params = pruned_params + self.pruned_ratios = pruned_ratios + self.print_prune_params = print_prune_params + # QAT hyperparameter + self.quant_config = quant_config + self.print_qat_model = print_qat_model + + def __call__(self, model): + # FIXME: adapt to network graph when Training and inference are + # inconsistent, now only supports prune inference network graph. 
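Putting the constructor arguments together, a `PrunerQAT` instance might be configured as below; the pruned parameter names are made up, and the `quant_config` keys follow the usual ppdet slim configurations:

```python
# Hypothetical prune-then-quantize setup (illustrative values):
pruner_qat = PrunerQAT(
    criterion='fpgm',
    pruned_params=['conv2d_27.w_0', 'conv2d_28.w_0'],  # made-up names
    pruned_ratios=[0.3, 0.3],
    print_prune_params=False,
    quant_config={
        'activation_preprocess_type': 'PACT',
        'weight_quantize_type': 'channel_wise_abs_max',
        'activation_quantize_type': 'moving_average_abs_max',
        'weight_bits': 8,
        'activation_bits': 8,
        'dtype': 'int8',
        'window_size': 10000,
        'moving_rate': 0.9,
        'quantizable_layer_type': ['Conv2D', 'Linear'],
    },
    print_qat_model=False)
# model = pruner_qat(model)   # prunes first, then inserts fake-quant ops
```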
+ model.eval() + paddleslim = try_import('paddleslim') + from paddleslim.analysis import dygraph_flops as flops + input_spec = [{ + "image": paddle.ones( + shape=[1, 3, 640, 640], dtype='float32'), + "im_shape": paddle.full( + [1, 2], 640, dtype='float32'), + "scale_factor": paddle.ones( + shape=[1, 2], dtype='float32') + }] + if self.print_prune_params: + print_prune_params(model) + + ori_flops = flops(model, input_spec) / 1000 + logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) + if self.criterion == 'fpgm': + pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) + elif self.criterion == 'l1_norm': + pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) + + logger.info("pruned params: {}".format(self.pruned_params)) + pruned_ratios = [float(n) for n in self.pruned_ratios] + ratios = {} + for i, param in enumerate(self.pruned_params): + ratios[param] = pruned_ratios[i] + pruner.prune_vars(ratios, [0]) + pruned_flops = flops(model, input_spec) / 1000 + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + pruned_flops, (ori_flops - pruned_flops) / ori_flops)) + + self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) + + self.quanter.quantize(model) + + if self.print_qat_model: + logger.info("Quantized model:") + logger.info(model) + + return model + + def save_quantized_model(self, layer, path, input_spec=None, **config): + self.quanter.save_quantized_model( + model=layer, path=path, input_spec=input_spec, **config) diff --git a/paddlers/models/ppdet/slim/quant.py b/paddlers/models/ppdet/slim/quant.py index a0fb0e6..7c4d40b 100644 --- a/paddlers/models/ppdet/slim/quant.py +++ b/paddlers/models/ppdet/slim/quant.py @@ -38,6 +38,11 @@ class QAT(object): logger.info("Model before quant:") logger.info(model) + # For PP-YOLOE, convert model to deploy firstly. + for layer in model.sublayers(): + if hasattr(layer, 'convert_to_deploy'): + layer.convert_to_deploy() + self.quanter.quantize(model) if self.print_model: diff --git a/paddlers/models/ppdet/utils/check.py b/paddlers/models/ppdet/utils/check.py index 4a0a176..3480893 100644 --- a/paddlers/models/ppdet/utils/check.py +++ b/paddlers/models/ppdet/utils/check.py @@ -20,12 +20,14 @@ import sys import paddle import six -import paddle.version as fluid_version +import paddle.version as paddle_version from .logger import setup_logger logger = setup_logger(__name__) -__all__ = ['check_gpu', 'check_npu', 'check_version', 'check_config'] +__all__ = [ + 'check_gpu', 'check_npu', 'check_xpu', 'check_version', 'check_config' +] def check_npu(use_npu): @@ -47,6 +49,25 @@ def check_npu(use_npu): pass +def check_xpu(use_xpu): + """ + Log error and exit when set use_xpu=true in paddlepaddle + cpu/gpu/npu version. + """ + err = "Config use_xpu cannot be set as true while you are " \ + "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ + "\t2. Set use_xpu as false in config file to run " \ + "model on CPU/GPU/NPU" + + try: + if use_xpu and not paddle.is_compiled_with_xpu(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + def check_gpu(use_gpu): """ Log error and exit when set use_gpu=true in paddlepaddle @@ -66,21 +87,23 @@ def check_gpu(use_gpu): pass -def check_version(version='2.0'): +def check_version(version='2.2'): """ Log error and exit when the installed version of paddlepaddle is not satisfied. 
""" err = "PaddlePaddle version {} or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ + "or a suitable release/2.5 version is satisfied as well. \n" \ "Please make sure the version is good with your code.".format(version) version_installed = [ - fluid_version.major, fluid_version.minor, fluid_version.patch, - fluid_version.rc + paddle_version.major, paddle_version.minor, paddle_version.patch, + paddle_version.rc ] + if version_installed == ['0', '0', '0', '0']: return + version_split = version.split('.') length = min(len(version_installed), len(version_split)) diff --git a/paddlers/models/ppdet/utils/checkpoint.py b/paddlers/models/ppdet/utils/checkpoint.py index 3a6087c..04aec4d 100644 --- a/paddlers/models/ppdet/utils/checkpoint.py +++ b/paddlers/models/ppdet/utils/checkpoint.py @@ -62,7 +62,7 @@ def _strip_postfix(path): return path -def load_weight(model, weight, optimizer=None): +def load_weight(model, weight, optimizer=None, ema=None): if is_url(weight): weight = get_weights_path(weight) @@ -72,14 +72,26 @@ def load_weight(model, weight, optimizer=None): raise ValueError("Model pretrain path {} does not " "exists.".format(pdparam_path)) - param_state_dict = paddle.load(pdparam_path) + if ema is not None and os.path.exists(path + '.pdema'): + # Exchange model and ema_model to load + ema_state_dict = paddle.load(pdparam_path) + param_state_dict = paddle.load(path + '.pdema') + else: + ema_state_dict = None + param_state_dict = paddle.load(pdparam_path) + model_dict = model.state_dict() model_weight = {} incorrect_keys = 0 - for key in model_dict.keys(): + for key, value in model_dict.items(): if key in param_state_dict.keys(): - model_weight[key] = param_state_dict[key] + if isinstance(param_state_dict[key], np.ndarray): + param_state_dict[key] = paddle.to_tensor(param_state_dict[key]) + if value.dtype == param_state_dict[key].dtype: + model_weight[key] = param_state_dict[key] + else: + model_weight[key] = param_state_dict[key].astype(value.dtype) else: logger.info('Unmatched key: {}'.format(key)) incorrect_keys += 1 @@ -102,6 +114,11 @@ def load_weight(model, weight, optimizer=None): last_epoch = optim_state_dict.pop('last_epoch') optimizer.set_state_dict(optim_state_dict) + if ema_state_dict is not None: + ema.resume(ema_state_dict, + optim_state_dict['LR_Scheduler']['last_epoch']) + elif ema_state_dict is not None: + ema.resume(ema_state_dict) return last_epoch @@ -197,33 +214,52 @@ def load_pretrain_weight(model, pretrain_weight): param_state_dict = paddle.load(weights_path) param_state_dict = match_state_dict(model_dict, param_state_dict) + for k, v in param_state_dict.items(): + if isinstance(v, np.ndarray): + v = paddle.to_tensor(v) + if model_dict[k].dtype != v.dtype: + param_state_dict[k] = v.astype(model_dict[k].dtype) + model.set_dict(param_state_dict) logger.info('Finish loading model weights: {}'.format(weights_path)) -def save_model(model, optimizer, save_dir, save_name, last_epoch): +def save_model(model, + optimizer, + save_dir, + save_name, + last_epoch, + ema_model=None): """ save model into disk. Args: - model (paddle.nn.Layer): the Layer instalce to save parameters. + model (dict): the model state_dict to save parameters. optimizer (paddle.optimizer.Optimizer): the Optimizer instance to save optimizer states. save_dir (str): the directory to be saved. save_name (str): the path to be saved. last_epoch (int): the epoch index. + ema_model (dict|None): the ema_model state_dict to save parameters. 
""" if paddle.distributed.get_rank() != 0: return + assert isinstance(model, dict), ("model is not a instance of dict, " + "please call model.state_dict() to get.") if not os.path.exists(save_dir): os.makedirs(save_dir) save_path = os.path.join(save_dir, save_name) - if isinstance(model, nn.Layer): - paddle.save(model.state_dict(), save_path + ".pdparams") - else: - assert isinstance(model, - dict), 'model is not a instance of nn.layer or dict' + # save model + if ema_model is None: paddle.save(model, save_path + ".pdparams") + else: + assert isinstance(ema_model, + dict), ("ema_model is not a instance of dict, " + "please call model.state_dict() to get.") + # Exchange model and ema_model to save + paddle.save(ema_model, save_path + ".pdparams") + paddle.save(model, save_path + ".pdema") + # save optimizer state_dict = optimizer.state_dict() state_dict['last_epoch'] = last_epoch paddle.save(state_dict, save_path + ".pdopt") diff --git a/paddlers/models/ppdet/utils/cli.py b/paddlers/models/ppdet/utils/cli.py index afe5f3f..295757b 100644 --- a/paddlers/models/ppdet/utils/cli.py +++ b/paddlers/models/ppdet/utils/cli.py @@ -81,6 +81,13 @@ class ArgsParser(ArgumentParser): return config +def merge_args(config, args, exclude_args=['config', 'opt', 'slim_config']): + for k, v in vars(args).items(): + if k not in exclude_args: + config[k] = v + return config + + def print_total_cfg(config): modules = get_registered_modules() color_tty = ColorTTY() diff --git a/paddlers/models/ppdet/utils/download.py b/paddlers/models/ppdet/utils/download.py index 2c00787..c36b236 100644 --- a/paddlers/models/ppdet/utils/download.py +++ b/paddlers/models/ppdet/utils/download.py @@ -96,8 +96,8 @@ DATASETS = { 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar', '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']), 'spine_coco': ([( - 'https://paddledet.bj.bcebos.com/data/spine_coco.tar', - '7ed69ae73f842cd2a8cf4f58dc3c5535', ), ], ['annotations', 'images']), + 'https://paddledet.bj.bcebos.com/data/spine.tar', + '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']), 'mot': (), 'objects365': (), 'coco_ce': ([( @@ -235,7 +235,7 @@ def create_voc_list(data_dir, devkit_subdir='VOCdevkit'): years = ['2007', '2012'] # NOTE: since using auto download VOC - # dataset, VOC default label list should be used, + # dataset, VOC default label list should be used, # do not generate label_list.txt here. For default # label, see ../data/source/voc.py create_list(devkit_dir, years, data_dir) @@ -387,13 +387,18 @@ def _download(url, path, md5sum=None): if chunk: f.write(chunk) shutil.move(tmp_fullname, fullname) - return fullname + return fullname def _download_dist(url, path, md5sum=None): env = os.environ if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: - trainer_id = int(env['PADDLE_TRAINER_ID']) + # Mainly used to solve the problem of downloading data from + # different machines in the case of multiple machines. + # Different nodes will download data, and the same node + # will only download data once. 
+ # Reference https://github.com/PaddlePaddle/PaddleClas/blob/release/2.5/ppcls/utils/download.py#L108 + rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) num_trainers = int(env['PADDLE_TRAINERS_NUM']) if num_trainers <= 1: return _download(url, path, md5sum) @@ -406,12 +411,9 @@ def _download_dist(url, path, md5sum=None): os.makedirs(path) if not osp.exists(fullname): - from paddle.distributed import ParallelEnv - unique_endpoints = _get_unique_endpoints(ParallelEnv() - .trainer_endpoints[:]) - with open(lock_path, 'w'): # touch + with open(lock_path, 'w'): # touch os.utime(lock_path, None) - if ParallelEnv().current_endpoint in unique_endpoints: + if rank_id_curr_node == 0: _download(url, path, md5sum) os.remove(lock_path) else: @@ -423,7 +425,7 @@ def _download_dist(url, path, md5sum=None): def _check_exist_file_md5(filename, md5sum, url): - # if md5sum is None, and file to check is weights file, + # if md5sum is None, and file to check is weights file, # read md5um from url and check, else check md5sum directly return _md5check_from_url(filename, url) if md5sum is None \ and filename.endswith('pdparams') \ @@ -523,7 +525,7 @@ def _decompress_dist(fname): # trainer pipeline in order # **change this if you have more elegent methods** if ParallelEnv().current_endpoint in unique_endpoints: - with open(lock_path, 'w'): # touch + with open(lock_path, 'w'): # touch os.utime(lock_path, None) _decompress(fname) os.remove(lock_path) diff --git a/paddlers/models/ppdet/utils/fuse_utils.py b/paddlers/models/ppdet/utils/fuse_utils.py new file mode 100644 index 0000000..647fa99 --- /dev/null +++ b/paddlers/models/ppdet/utils/fuse_utils.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import paddle +import paddle.nn as nn + +__all__ = ['fuse_conv_bn'] + + +def fuse_conv_bn(model): + is_train = False + if model.training: + model.eval() + is_train = True + fuse_list = [] + tmp_pair = [None, None] + for name, layer in model.named_sublayers(): + if isinstance(layer, nn.Conv2D): + tmp_pair[0] = name + if isinstance(layer, nn.BatchNorm2D): + tmp_pair[1] = name + + if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2: + fuse_list.append(tmp_pair) + tmp_pair = [None, None] + model = fuse_layers(model, fuse_list) + if is_train: + model.train() + return model + + +def find_parent_layer_and_sub_name(model, name): + """ + Given the model and the name of a layer, find the parent layer and + the sub_name of the layer. + For example, if name is 'block_1/convbn_1/conv_1', the parent layer is + 'block_1/convbn_1' and the sub_name is `conv_1`. + Args: + model(paddle.nn.Layer): the model to be quantized. + name(string): the name of a layer + + Returns: + parent_layer, subname + """ + assert isinstance(model, nn.Layer), \ + "The model must be the instance of paddle.nn.Layer." + assert len(name) > 0, "The input (name) should not be empty." 
diff --git a/paddlers/models/ppdet/utils/fuse_utils.py b/paddlers/models/ppdet/utils/fuse_utils.py
new file mode 100644
index 0000000..647fa99
--- /dev/null
+++ b/paddlers/models/ppdet/utils/fuse_utils.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+import paddle
+import paddle.nn as nn
+
+__all__ = ['fuse_conv_bn']
+
+
+def fuse_conv_bn(model):
+    is_train = False
+    if model.training:
+        model.eval()
+        is_train = True
+    fuse_list = []
+    tmp_pair = [None, None]
+    for name, layer in model.named_sublayers():
+        if isinstance(layer, nn.Conv2D):
+            tmp_pair[0] = name
+        if isinstance(layer, nn.BatchNorm2D):
+            tmp_pair[1] = name
+
+        if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2:
+            fuse_list.append(tmp_pair)
+            tmp_pair = [None, None]
+    model = fuse_layers(model, fuse_list)
+    if is_train:
+        model.train()
+    return model
+
+
+def find_parent_layer_and_sub_name(model, name):
+    """
+    Given the model and the name of a layer, find the parent layer and
+    the sub_name of the layer.
+    For example, if name is 'block_1.convbn_1.conv_1', the parent layer is
+    'block_1.convbn_1' and the sub_name is `conv_1`.
+
+    Args:
+        model(paddle.nn.Layer): the model containing the layer.
+        name(string): the name of the layer.
+
+    Returns:
+        parent_layer, sub_name
+    """
+    assert isinstance(model, nn.Layer), \
+        "The model must be an instance of paddle.nn.Layer."
+    assert len(name) > 0, "The input (name) should not be empty."
+
+    last_idx = 0
+    idx = 0
+    parent_layer = model
+    while idx < len(name):
+        if name[idx] == '.':
+            sub_name = name[last_idx:idx]
+            if hasattr(parent_layer, sub_name):
+                parent_layer = getattr(parent_layer, sub_name)
+                last_idx = idx + 1
+        idx += 1
+    sub_name = name[last_idx:idx]
+    return parent_layer, sub_name
+
+
+class Identity(nn.Layer):
+    '''a layer to replace bn or relu layers'''
+
+    def __init__(self, *args, **kwargs):
+        super(Identity, self).__init__()
+
+    def forward(self, input):
+        return input
+
+
+def fuse_layers(model, layers_to_fuse, inplace=False):
+    '''
+    Fuse the layers named in layers_to_fuse.
+
+    Args:
+        model(nn.Layer): The model to be fused.
+        layers_to_fuse(list): The names of the layers to be fused, e.g.
+            "fuse_list = [["conv1", "bn1"], ["conv2", "bn2"]]".
+        inplace(bool): Whether to apply fusing to the input model itself.
+            Default: False.
+
+    Returns:
+        fused_model(paddle.nn.Layer): The fused model.
+    '''
+    if not inplace:
+        model = copy.deepcopy(model)
+    for layers_list in layers_to_fuse:
+        layer_list = []
+        for layer_name in layers_list:
+            parent_layer, sub_name = find_parent_layer_and_sub_name(model,
+                                                                    layer_name)
+            layer_list.append(getattr(parent_layer, sub_name))
+        new_layers = _fuse_func(layer_list)
+        for i, item in enumerate(layers_list):
+            parent_layer, sub_name = find_parent_layer_and_sub_name(model, item)
+            setattr(parent_layer, sub_name, new_layers[i])
+    return model
+
+
+def _fuse_func(layer_list):
+    '''choose the fuser method and fuse layers'''
+    types = tuple(type(m) for m in layer_list)
+    fusion_method = types_to_fusion_method.get(types, None)
+    new_layers = [None] * len(layer_list)
+    fused_layer = fusion_method(*layer_list)
+    for handle_id, pre_hook_fn in layer_list[0]._forward_pre_hooks.items():
+        fused_layer.register_forward_pre_hook(pre_hook_fn)
+        del layer_list[0]._forward_pre_hooks[handle_id]
+    for handle_id, hook_fn in layer_list[-1]._forward_post_hooks.items():
+        fused_layer.register_forward_post_hook(hook_fn)
+        del layer_list[-1]._forward_post_hooks[handle_id]
+    new_layers[0] = fused_layer
+    for i in range(1, len(layer_list)):
+        identity = Identity()
+        identity.training = layer_list[0].training
+        new_layers[i] = identity
+    return new_layers
+
+
+def _fuse_conv_bn(conv, bn):
+    '''fuse conv and bn for train or eval'''
+    assert (conv.training == bn.training), \
+        "Conv and BN must both be in the same mode (train or eval)."
+    if conv.training:
+        assert bn._num_features == conv._out_channels, \
+            'Output channels of Conv2D must match num_features of BatchNorm2D.'
+        raise NotImplementedError
+    else:
+        return _fuse_conv_bn_eval(conv, bn)
+
+
+def _fuse_conv_bn_eval(conv, bn):
+    '''fuse conv and bn for eval'''
+    assert (not (conv.training or bn.training)), "Fusion only for eval!"
+    fused_conv = copy.deepcopy(conv)
+
+    fused_weight, fused_bias = _fuse_conv_bn_weights(
+        fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon,
+        bn.weight, bn.bias)
+    fused_conv.weight.set_value(fused_weight)
+    if fused_conv.bias is None:
+        fused_conv.bias = paddle.create_parameter(
+            shape=[fused_conv._out_channels], is_bias=True, dtype=bn.bias.dtype)
+    fused_conv.bias.set_value(fused_bias)
+    return fused_conv
+
+
+def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):
+    '''fuse weights and bias of conv and bn'''
+    if conv_b is None:
+        conv_b = paddle.zeros_like(bn_rm)
+    if bn_w is None:
+        bn_w = paddle.ones_like(bn_rm)
+    if bn_b is None:
+        bn_b = paddle.zeros_like(bn_rm)
+    bn_var_rsqrt = paddle.rsqrt(bn_rv + bn_eps)
+    conv_w = conv_w * \
+        (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1))
+    conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b
+    return conv_w, conv_b
+
+
+types_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, }
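The arithmetic in _fuse_conv_bn_weights folds an inference-mode BatchNorm into the preceding convolution: with the per-channel scale s = bn_w / sqrt(bn_rv + bn_eps), the fused kernel is conv_w * s (broadcast over output channels) and the fused bias is (conv_b - bn_rm) * s + bn_b, so the fused Conv2D alone reproduces Conv2D + BatchNorm2D. A small numpy check of that algebra on a 1x1 convolution, which reduces to a per-pixel matmul (an illustration only, independent of the Paddle code above):

import numpy as np

rng = np.random.default_rng(0)
cout, cin, eps = 3, 4, 1e-5
x = rng.normal(size=(cin,))              # one pixel, cin channels
w = rng.normal(size=(cout, cin))         # 1x1 conv == matmul
b = rng.normal(size=(cout,))
mean = rng.normal(size=(cout,))          # BN running statistics
var = rng.uniform(0.5, 2.0, size=(cout,))
gamma = rng.normal(size=(cout,))         # BN affine parameters
beta = rng.normal(size=(cout,))

# Reference: conv followed by inference-mode batch norm.
y_ref = gamma * (w @ x + b - mean) / np.sqrt(var + eps) + beta

# Folded form, mirroring _fuse_conv_bn_weights.
scale = gamma / np.sqrt(var + eps)
w_fused = w * scale[:, None]
b_fused = (b - mean) * scale + beta
y_fused = w_fused @ x + b_fused

assert np.allclose(y_ref, y_fused)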
diff --git a/paddlers/tasks/classifier.py b/paddlers/tasks/classifier.py
index c1074a3..dff79e7 100644
--- a/paddlers/tasks/classifier.py
+++ b/paddlers/tasks/classifier.py
@@ -167,13 +167,8 @@ class BaseClassifier(BaseModel):
                 weight_decay=paddle.regularizer.L2Decay(L2_coeff))
         return optimizer

-    def default_postprocess(self, class_id_map_file):
-        default_config = {
-            "name": "Topk",
-            "topk": 1,
-            "class_id_map_file": class_id_map_file
-        }
-        return build_postprocess(default_config)
+    def default_postprocess(self):
+        return self.build_postprocess_from_labels(topk=1)

     def build_postprocess_from_labels(self, topk=1):
         label_dict = dict()
@@ -250,7 +245,7 @@ class BaseClassifier(BaseModel):
         if self.losses is None:
             self.losses = self.default_loss()
         self.metrics = self.default_metric()
-        self.postprocess = self.default_postprocess(train_dataset.label_list)
+        self.postprocess = self.default_postprocess()

         if optimizer is None:
             num_steps_each_epoch = train_dataset.num_samples // train_batch_size
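For the classifier change above: default_postprocess no longer builds a Topk postprocessor from a class-id map file but delegates to build_postprocess_from_labels, which derives the id-to-label mapping from the dataset's label_list. A hedged sketch of what a topk=1 postprocess conceptually produces (the function name and output format below are illustrative, not the actual PaddleRS/PaddleClas API):

import numpy as np


def topk_postprocess(scores, label_list, topk=1):
    # Map raw per-class scores to (class_id, score, label) records.
    ids = np.argsort(scores)[::-1][:topk]
    return [{"class_id": int(i),
             "score": float(scores[i]),
             "label": label_list[int(i)]} for i in ids]


print(topk_postprocess(np.array([0.1, 0.7, 0.2]),
                       ["water", "forest", "urban"]))
# [{'class_id': 1, 'score': 0.7, 'label': 'forest'}]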