[Fix] Update ppdet Version and Update QR Code (#64)

Lin Manhui 2 years ago committed by GitHub
parent 6752db2de9
commit a4957b21be
Changed files (number in parentheses is the count of changed lines; BIN marks a binary change):
  1. .github/workflows/build.yaml (8)
  2. README.md (2)
  3. docs/images/whole_picture.png (BIN)
  4. paddlers/models/hash.txt (1)
  5. paddlers/models/ppdet/core/workspace.py (14)
  6. paddlers/models/ppdet/data/reader.py (2)
  7. paddlers/models/ppdet/data/shm_utils.py (3)
  8. paddlers/models/ppdet/data/source/__init__.py (1)
  9. paddlers/models/ppdet/data/source/category.py (37)
  10. paddlers/models/ppdet/data/source/coco.py (164)
  11. paddlers/models/ppdet/data/source/dataset.py (126)
  12. paddlers/models/ppdet/data/source/mot.py (12)
  13. paddlers/models/ppdet/data/source/voc.py (7)
  14. paddlers/models/ppdet/data/transform/__init__.py (2)
  15. paddlers/models/ppdet/data/transform/batch_operators.py (121)
  16. paddlers/models/ppdet/data/transform/keypoint_operators.py (4)
  17. paddlers/models/ppdet/data/transform/mot_operators.py (2)
  18. paddlers/models/ppdet/data/transform/operators.py (553)
  19. paddlers/models/ppdet/data/transform/rotated_operators.py (479)
  20. paddlers/models/ppdet/data/utils.py (72)
  21. paddlers/models/ppdet/engine/callbacks.py (167)
  22. paddlers/models/ppdet/engine/export_utils.py (38)
  23. paddlers/models/ppdet/engine/tracker.py (175)
  24. paddlers/models/ppdet/engine/trainer.py (547)
  25. paddlers/models/ppdet/ext_op/README.md (35)
  26. paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cc (90)
  27. paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cu (63)
  28. paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc (97)
  29. paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cu (114)
  30. paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.h (348)
  31. paddlers/models/ppdet/ext_op/setup.py (33)
  32. paddlers/models/ppdet/ext_op/unittest/test_matched_rbox_iou.py (149)
  33. paddlers/models/ppdet/ext_op/unittest/test_rbox_iou.py (151)
  34. paddlers/models/ppdet/metrics/json_results.py (12)
  35. paddlers/models/ppdet/metrics/keypoint_metrics.py (21)
  36. paddlers/models/ppdet/metrics/map_utils.py (28)
  37. paddlers/models/ppdet/metrics/mcmot_metrics.py (32)
  38. paddlers/models/ppdet/metrics/metrics.py (175)
  39. paddlers/models/ppdet/metrics/mot_metrics.py (46)
  40. paddlers/models/ppdet/model_zoo/.gitignore (1)
  41. paddlers/models/ppdet/model_zoo/tests/__init__.py (13)
  42. paddlers/models/ppdet/model_zoo/tests/test_get_model.py (48)
  43. paddlers/models/ppdet/model_zoo/tests/test_list_model.py (68)
  44. paddlers/models/ppdet/modeling/__init__.py (2)
  45. paddlers/models/ppdet/modeling/architectures/__init__.py (13)
  46. paddlers/models/ppdet/modeling/architectures/bytetrack.py (79)
  47. paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py (8)
  48. paddlers/models/ppdet/modeling/architectures/centernet.py (0)
  49. paddlers/models/ppdet/modeling/architectures/deepsort.py (5)
  50. paddlers/models/ppdet/modeling/architectures/fairmot.py (0)
  51. paddlers/models/ppdet/modeling/architectures/faster_rcnn.py (4)
  52. paddlers/models/ppdet/modeling/architectures/keypoint_hrhrnet.py (2)
  53. paddlers/models/ppdet/modeling/architectures/mask_rcnn.py (8)
  54. paddlers/models/ppdet/modeling/architectures/meta_arch.py (44)
  55. paddlers/models/ppdet/modeling/architectures/picodet.py (20)
  56. paddlers/models/ppdet/modeling/architectures/retinanet.py (68)
  57. paddlers/models/ppdet/modeling/architectures/s2anet.py (47)
  58. paddlers/models/ppdet/modeling/architectures/yolo.py (5)
  59. paddlers/models/ppdet/modeling/architectures/yolox.py (138)
  60. paddlers/models/ppdet/modeling/assigners/__init__.py (2)
  61. paddlers/models/ppdet/modeling/assigners/atss_assigner.py (60)
  62. paddlers/models/ppdet/modeling/assigners/max_iou_assigner.py (54)
  63. paddlers/models/ppdet/modeling/assigners/simota_assigner.py (5)
  64. paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py (38)
  65. paddlers/models/ppdet/modeling/assigners/utils.py (71)
  66. paddlers/models/ppdet/modeling/backbones/__init__.py (13)
  67. paddlers/models/ppdet/modeling/backbones/blazenet.py (2)
  68. paddlers/models/ppdet/modeling/backbones/convnext.py (245)
  69. paddlers/models/ppdet/modeling/backbones/csp_darknet.py (404)
  70. paddlers/models/ppdet/modeling/backbones/cspresnet.py (321)
  71. paddlers/models/ppdet/modeling/backbones/darknet.py (25)
  72. paddlers/models/ppdet/modeling/backbones/dla.py (0)
  73. paddlers/models/ppdet/modeling/backbones/esnet.py (4)
  74. paddlers/models/ppdet/modeling/backbones/ghostnet.py (8)
  75. paddlers/models/ppdet/modeling/backbones/hardnet.py (6)
  76. paddlers/models/ppdet/modeling/backbones/lcnet.py (41)
  77. paddlers/models/ppdet/modeling/backbones/mobilenet_v1.py (2)
  78. paddlers/models/ppdet/modeling/backbones/mobilenet_v3.py (8)
  79. paddlers/models/ppdet/modeling/backbones/mobileone.py (266)
  80. paddlers/models/ppdet/modeling/backbones/resnet.py (0)
  81. paddlers/models/ppdet/modeling/backbones/senet.py (2)
  82. paddlers/models/ppdet/modeling/backbones/shufflenet_v2.py (5)
  83. paddlers/models/ppdet/modeling/backbones/swin_transformer.py (93)
  84. paddlers/models/ppdet/modeling/backbones/transformer_utils.py (74)
  85. paddlers/models/ppdet/modeling/backbones/vgg.py (6)
  86. paddlers/models/ppdet/modeling/backbones/vision_transformer.py (634)
  87. paddlers/models/ppdet/modeling/bbox_utils.py (441)
  88. paddlers/models/ppdet/modeling/cls_utils.py (40)
  89. paddlers/models/ppdet/modeling/heads/__init__.py (4)
  90. paddlers/models/ppdet/modeling/heads/bbox_head.py (38)
  91. paddlers/models/ppdet/modeling/heads/cascade_head.py (70)
  92. paddlers/models/ppdet/modeling/heads/centernet_head.py (0)
  93. paddlers/models/ppdet/modeling/heads/face_head.py (3)
  94. paddlers/models/ppdet/modeling/heads/fcos_head.py (2)
  95. paddlers/models/ppdet/modeling/heads/gfl_head.py (114)
  96. paddlers/models/ppdet/modeling/heads/mask_head.py (35)
  97. paddlers/models/ppdet/modeling/heads/pico_head.py (528)
  98. paddlers/models/ppdet/modeling/heads/ppyoloe_head.py (388)
  99. paddlers/models/ppdet/modeling/heads/retina_head.py (249)
  100. paddlers/models/ppdet/modeling/heads/roi_extractor.py (36)
Some files were not shown because too many files have changed in this diff.

@@ -26,16 +26,16 @@ jobs:
include:
- python-version: "3.7"
os: windows-latest
gdal-whl-url: https://download.lfd.uci.edu/pythonlibs/archived/cp37/GDAL-3.3.3-cp37-cp37m-win_amd64.whl
gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.3.3-cp37-cp37m-win_amd64.whl
- python-version: "3.7"
os: ubuntu-latest
gdal-whl-url: https://versaweb.dl.sourceforge.net/project/gdal-wheels-for-linux/GDAL-3.4.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl
gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.4.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl
- python-version: "3.8"
os: windows-latest
gdal-whl-url: https://download.lfd.uci.edu/pythonlibs/archived/GDAL-3.3.3-cp38-cp38-win_amd64.whl
gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.3.3-cp38-cp38-win_amd64.whl
- python-version: "3.8"
os: ubuntu-latest
gdal-whl-url: https://versaweb.dl.sourceforge.net/project/gdal-wheels-for-linux/GDAL-3.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl
gdal-whl-url: https://paddlers.bj.bcebos.com/dependencies/gdal/GDAL-3.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl
fail-fast: false
steps:
- uses: actions/checkout@v3

@@ -48,7 +48,7 @@ PaddleRS has the following five key features:
* If you find any problems with PaddleRS or have suggestions, feel free to raise them via [GitHub Issues](https://github.com/PaddlePaddle/PaddleRS/issues).
* Welcome to join the PaddleRS WeChat group:
<div align="center">
<img src="https://user-images.githubusercontent.com/21275753/199192024-79373ad7-917f-4a7c-9de2-010a4d0c0152.png" width = "150" />
<img src="https://user-images.githubusercontent.com/21275753/200470530-a3321f5b-fa8e-4330-84fa-b76cb3df873a.png" width = "150" />
</div>
## <img src="./docs/images/model.png" width="30"/> Product Matrix

Binary file not shown. Size before: 280 KiB, after: 281 KiB.

@@ -1 +1,2 @@
ppdet ba2aad26e6bc1e5c2dad76ca96692a0d63eccfac
ppseg f6c73b478cdf00f40ae69edd35bf6bce2a1687ef

@@ -210,9 +210,17 @@ def create(cls_or_name, **kwargs):
assert type(cls_or_name) in [type, str
], "should be a class or name of a class"
name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__
assert name in global_config and \
isinstance(global_config[name], SchemaDict), \
"the module {} is not registered".format(name)
if name in global_config:
if isinstance(global_config[name], SchemaDict):
pass
elif hasattr(global_config[name], "__dict__"):
# support instance return directly
return global_config[name]
else:
raise ValueError("The module {} is not registered".format(name))
else:
raise ValueError("The module {} is not registered".format(name))
config = global_config[name]
cls = getattr(config.pymodule, name)
cls_kwargs = {}
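For readers of this hunk: the rewritten branch means `create` no longer insists on a `SchemaDict` entry; anything in `global_config` that carries a `__dict__` is handed back as-is. A minimal sketch of the new contract, with illustrative names not taken from the diff:

```python
# Minimal sketch of the patched create() behavior (illustrative names).
class Backbone:
    pass

global_config = {'my_backbone': Backbone()}  # an instance, not a SchemaDict

def create_sketch(name):
    if name in global_config:
        entry = global_config[name]
        if hasattr(entry, '__dict__'):
            # support instance: return it directly
            return entry
    raise ValueError("The module {} is not registered".format(name))

assert isinstance(create_sketch('my_backbone'), Backbone)
```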

@@ -23,7 +23,7 @@ else:
import numpy as np
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.fluid.dataloader.collate import default_collate_fn
from .utils import default_collate_fn
from paddlers.models.ppdet.core.workspace import register
from . import transform
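The swap above replaces a private Paddle import (`paddle.fluid.dataloader.collate`) with a vendored copy in the new `paddlers/models/ppdet/data/utils.py`. As a rough sketch of what such a collate function does (the vendored implementation may differ in details):

```python
# Hedged sketch of a default_collate_fn: stack numpy fields across the
# batch, recursing into dicts and sequences.
import numpy as np

def default_collate_fn_sketch(batch):
    sample = batch[0]
    if isinstance(sample, np.ndarray):
        return np.stack(batch, axis=0)
    if isinstance(sample, dict):
        return {
            key: default_collate_fn_sketch([b[key] for b in batch])
            for key in sample
        }
    if isinstance(sample, (list, tuple)):
        return [default_collate_fn_sketch(list(fields)) for fields in zip(*batch)]
    return np.array(batch)  # scalars fall through to a plain array
```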

@@ -34,6 +34,9 @@ SHM_DEFAULT_MOUNT = '/dev/shm'
def _parse_size_in_M(size_str):
if size_str[-1] == 'B':
num, unit = size_str[:-2], size_str[-2]
else:
num, unit = size_str[:-1], size_str[-1]
assert unit in SIZE_UNIT, \
"unknown shm size unit {}".format(unit)

@@ -27,3 +27,4 @@ from .category import *
from .keypoint_coco import *
from .mot import *
from .sniper_coco import SniperCOCODataSet
from .dataset import ImageFolder

@@ -39,24 +39,49 @@ def get_categories(metric_type, anno_file=None, arch=None):
if arch == 'keypoint_arch':
return (None, {'id': 'keypoint'})
if anno_file == None or (not os.path.isfile(anno_file)):
logger.warning(
"anno_file '{}' is None or not set or not exist, "
"please recheck TrainDataset/EvalDataset/TestDataset.anno_path, "
"otherwise the default categories will be used by metric_type.".
format(anno_file))
if metric_type.lower() == 'coco' or metric_type.lower(
) == 'rbox' or metric_type.lower() == 'snipercoco':
if anno_file and os.path.isfile(anno_file):
if anno_file.endswith('json'):
# lazy import pycocotools here
from pycocotools.coco import COCO
coco = COCO(anno_file)
cats = coco.loadCats(coco.getCatIds())
clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
catid2name = {cat['id']: cat['name'] for cat in cats}
elif anno_file.endswith('txt'):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background': cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
else:
raise ValueError("anno_file {} should be json or txt.".format(
anno_file))
return clsid2catid, catid2name
# anno file not exist, load default categories of COCO17
else:
if metric_type.lower() == 'rbox':
logger.warning(
"metric_type: {}, load default categories of DOTA.".format(
metric_type))
return _dota_category()
logger.warning("metric_type: {}, load default categories of COCO.".
format(metric_type))
return _coco17_category()
elif metric_type.lower() == 'voc':
@@ -77,6 +102,8 @@ def get_categories(metric_type, anno_file=None, arch=None):
# anno file not exist, load default categories of
# VOC all 20 categories
else:
logger.warning("metric_type: {}, load default categories of VOC.".
format(metric_type))
return _vocall_category()
elif metric_type.lower() == 'oid':
@@ -104,6 +131,9 @@ def get_categories(metric_type, anno_file=None, arch=None):
return clsid2catid, catid2name
# anno file not exist, load default category 'pedestrian'.
else:
logger.warning(
"metric_type: {}, load default categories of pedestrian MOT.".
format(metric_type))
return _mot_category(category='pedestrian')
elif metric_type.lower() in ['kitti', 'bdd100kmot']:
@@ -122,6 +152,9 @@ def get_categories(metric_type, anno_file=None, arch=None):
return clsid2catid, catid2name
# anno file not exist, load default categories of visdrone all 10 categories
else:
logger.warning(
"metric_type: {}, load default categories of VisDrone.".format(
metric_type))
return _visdrone_category()
else:
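For the txt branch added above, a label list whose first line is 'background' has that entry dropped before the id maps are built. A small worked example:

```python
# Contents of a hypothetical label_list.txt, one class name per line.
cats = ['background', 'building', 'road']
if cats[0] == 'background':
    cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}    # {0: 0, 1: 1}
catid2name = {i: n for i, n in enumerate(cats)}   # {0: 'building', 1: 'road'}
```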

@@ -39,6 +39,7 @@ class COCODataSet(DetDataset):
empty_ratio (float): the ratio of empty record number to total
record's, if empty_ratio is out of [0. ,1.), do not sample the
records and use all the empty entries. 1. as default
repeat (int): repeat times for dataset, use in benchmark.
"""
def __init__(self,
@@ -49,9 +50,15 @@ class COCODataSet(DetDataset):
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.):
super(COCODataSet, self).__init__(dataset_dir, image_dir, anno_path,
data_fields, sample_num)
empty_ratio=1.,
repeat=1):
super(COCODataSet, self).__init__(
dataset_dir,
image_dir,
anno_path,
data_fields,
sample_num,
repeat=repeat)
self.load_image_only = False
self.load_semantic = False
self.load_crowd = load_crowd
@@ -138,15 +145,6 @@ class COCODataSet(DetDataset):
if not any(np.array(inst['bbox'])):
continue
# read rbox anno or not
is_rbox_anno = True if len(inst['bbox']) == 5 else False
if is_rbox_anno:
xc, yc, box_w, box_h, angle = inst['bbox']
x1 = xc - box_w / 2.0
y1 = yc - box_h / 2.0
x2 = x1 + box_w
y2 = y1 + box_h
else:
x1, y1, box_w, box_h = inst['bbox']
x2 = x1 + box_w
y2 = y1 + box_h
@@ -155,8 +153,6 @@ class COCODataSet(DetDataset):
inst['clean_bbox'] = [
round(float(x), 3) for x in [x1, y1, x2, y2]
]
if is_rbox_anno:
inst['clean_rbox'] = [xc, yc, box_w, box_h, angle]
bboxes.append(inst)
else:
logger.warning(
@@ -171,9 +167,6 @@ class COCODataSet(DetDataset):
is_empty = True
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
if is_rbox_anno:
gt_rbox = np.zeros((num_bbox, 5), dtype=np.float32)
gt_theta = np.zeros((num_bbox, 1), dtype=np.int32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
gt_poly = [None] * num_bbox
@@ -183,13 +176,10 @@ class COCODataSet(DetDataset):
catid = box['category_id']
gt_class[i][0] = self.catid2clsid[catid]
gt_bbox[i, :] = box['clean_bbox']
# xc, yc, w, h, theta
if is_rbox_anno:
gt_rbox[i, :] = box['clean_rbox']
is_crowd[i][0] = box['iscrowd']
# check RLE format
if 'segmentation' in box and box['iscrowd'] == 1:
gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
elif 'segmentation' in box and box['segmentation']:
if not np.array(box['segmentation']
).size > 0 and not self.allow_empty:
@@ -206,15 +196,6 @@ class COCODataSet(DetDataset):
gt_poly) and not self.allow_empty:
continue
if is_rbox_anno:
gt_rec = {
'is_crowd': is_crowd,
'gt_class': gt_class,
'gt_bbox': gt_bbox,
'gt_rbox': gt_rbox,
'gt_poly': gt_poly,
}
else:
gt_rec = {
'is_crowd': is_crowd,
'gt_class': gt_class,
@@ -247,3 +228,126 @@ class COCODataSet(DetDataset):
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
@register
@serializable
class SlicedCOCODataSet(COCODataSet):
"""Sliced COCODataSet"""
def __init__(
self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1,
sliced_size=[640, 640],
overlap_ratio=[0.25, 0.25], ):
super(SlicedCOCODataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
load_crowd=load_crowd,
allow_empty=allow_empty,
empty_ratio=empty_ratio,
repeat=repeat, )
self.sliced_size = sliced_size
self.overlap_ratio = overlap_ratio
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path
from pycocotools.coco import COCO
coco = COCO(anno_path)
img_ids = coco.getImgIds()
img_ids.sort()
cat_ids = coco.getCatIds()
records = []
empty_records = []
ct = 0
ct_sub = 0
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
if 'annotations' not in coco.dataset:
self.load_image_only = True
logger.warning('Annotation file: {} does not contain ground truth '
'and load image information only.'.format(anno_path))
try:
import sahi
from sahi.slicing import slice_image
except Exception as e:
logger.error(
'sahi not found, please install sahi. '
'for example: `pip install sahi`, see https://github.com/obss/sahi.'
)
raise e
sub_img_ids = 0
for img_id in img_ids:
img_anno = coco.loadImgs([img_id])[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
slice_image_result = sahi.slicing.slice_image(
image=im_path,
slice_height=self.sliced_size[0],
slice_width=self.sliced_size[1],
overlap_height_ratio=self.overlap_ratio[0],
overlap_width_ratio=self.overlap_ratio[1])
sub_img_num = len(slice_image_result)
for _ind in range(sub_img_num):
im = slice_image_result.images[_ind]
coco_rec = {
'image': im,
'im_id': np.array([sub_img_ids + _ind]),
'h': im.shape[0],
'w': im.shape[1],
'ori_im_id': np.array([img_id]),
'st_pix': np.array(
slice_image_result.starting_pixels[_ind],
dtype=np.float32),
'is_last': 1 if _ind == sub_img_num - 1 else 0,
} if 'image' in self.data_fields else {}
records.append(coco_rec)
ct_sub += sub_img_num
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
assert ct > 0, 'not found any coco record in %s' % (anno_path)
logger.info('{} samples and slice to {} sub_samples in file {}'.format(
ct, ct_sub, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
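`SlicedCOCODataSet` delegates the actual tiling to sahi, mirroring the call made in `parse_dataset` above. A standalone sketch of that step (the image path is hypothetical):

```python
# pip install sahi; 640x640 tiles with 25% overlap, the dataset defaults.
from sahi.slicing import slice_image

result = slice_image(
    image='demo.jpg',                 # hypothetical input image
    slice_height=640,
    slice_width=640,
    overlap_height_ratio=0.25,
    overlap_width_ratio=0.25)
for tile, origin in zip(result.images, result.starting_pixels):
    print(tile.shape, origin)         # e.g. (640, 640, 3) and [x0, y0]
```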

@@ -13,8 +13,8 @@
# limitations under the License.
import os
import copy
import numpy as np
try:
from collections.abc import Sequence
except Exception:
@@ -22,7 +22,10 @@ except Exception:
from paddle.io import Dataset
from paddlers.models.ppdet.core.workspace import register, serializable
from paddlers.models.ppdet.utils.download import get_dataset_path
import copy
from paddlers.models.ppdet.data import source
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@serializable
@@ -37,6 +40,7 @@ class DetDataset(Dataset):
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
use_default_label (bool): whether to load default label list.
repeat (int): repeat times for dataset, use in benchmark.
"""
def __init__(self,
@@ -46,6 +50,7 @@ class DetDataset(Dataset):
data_fields=['image'],
sample_num=-1,
use_default_label=None,
repeat=1,
**kwargs):
super(DetDataset, self).__init__()
self.dataset_dir = dataset_dir if dataset_dir is not None else ''
@@ -54,28 +59,32 @@ class DetDataset(Dataset):
self.data_fields = data_fields
self.sample_num = sample_num
self.use_default_label = use_default_label
self.repeat = repeat
self._epoch = 0
self._curr_iter = 0
def __len__(self, ):
return len(self.roidbs)
return len(self.roidbs) * self.repeat
def __call__(self, *args, **kwargs):
return self
def __getitem__(self, idx):
n = len(self.roidbs)
if self.repeat > 1:
idx %= n
# data batch
roidb = copy.deepcopy(self.roidbs[idx])
if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
n = len(self.roidbs)
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
n = len(self.roidbs)
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
n = len(self.roidbs)
roidb = [roidb, ] + [
copy.deepcopy(self.roidbs[np.random.randint(n)])
for _ in range(3)
for _ in range(4)
]
if isinstance(roidb, Sequence):
for r in roidb:
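The `repeat` mechanics above are just index arithmetic: the dataset reports a length of `n * repeat` and folds every index back into the real record range.

```python
# With 100 records and repeat=3, __len__ reports 300 and index 250
# resolves to record 50.
n, repeat = 100, 3
assert n * repeat == 300
idx = 250
idx %= n
assert idx == 50
```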
@@ -149,12 +158,15 @@ class ImageFolder(DetDataset):
self.sample_num = sample_num
def check_or_download_dataset(self):
return
def get_anno(self):
if self.anno_path is None:
return
if self.dataset_dir:
# NOTE: ImageFolder is only used for prediction, in
# infer mode, image_dir is set by set_images
# so we only check anno_path here
self.dataset_dir = get_dataset_path(self.dataset_dir,
self.anno_path, None)
return os.path.join(self.dataset_dir, self.anno_path)
else:
return self.anno_path
def parse_dataset(self, ):
if not self.roidbs:
@@ -195,3 +207,93 @@ class ImageFolder(DetDataset):
def set_images(self, images):
self.image_dir = images
self.roidbs = self._load_images()
def set_slice_images(self,
images,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25]):
self.image_dir = images
ori_records = self._load_images()
try:
import sahi
from sahi.slicing import slice_image
except Exception as e:
logger.error(
'sahi not found, please install sahi. '
'for example: `pip install sahi`, see https://github.com/obss/sahi.'
)
raise e
sub_img_ids = 0
ct = 0
ct_sub = 0
records = []
for i, ori_rec in enumerate(ori_records):
im_path = ori_rec['im_file']
slice_image_result = sahi.slicing.slice_image(
image=im_path,
slice_height=slice_size[0],
slice_width=slice_size[1],
overlap_height_ratio=overlap_ratio[0],
overlap_width_ratio=overlap_ratio[1])
sub_img_num = len(slice_image_result)
for _ind in range(sub_img_num):
im = slice_image_result.images[_ind]
rec = {
'image': im,
'im_id': np.array([sub_img_ids + _ind]),
'h': im.shape[0],
'w': im.shape[1],
'ori_im_id': np.array([ori_rec['im_id'][0]]),
'st_pix': np.array(
slice_image_result.starting_pixels[_ind],
dtype=np.float32),
'is_last': 1 if _ind == sub_img_num - 1 else 0,
} if 'image' in self.data_fields else {}
records.append(rec)
ct_sub += sub_img_num
ct += 1
print('{} samples and slice to {} sub_samples'.format(ct, ct_sub))
self.roidbs = records
def get_label_list(self):
# Only VOC dataset needs label list in ImageFold
return self.anno_path
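A hedged usage sketch for the sliced-inference path added above (assumes an `ImageFolder` built through the usual config machinery, and a hypothetical input image):

```python
dataset = ImageFolder()                  # fields normally filled by config
dataset.set_slice_images(
    ['large_scene.jpg'],                 # hypothetical image path
    slice_size=[640, 640],
    overlap_ratio=[0.25, 0.25])
# dataset.roidbs now holds one record per 640x640 tile; each record keeps
# 'ori_im_id' and the tile origin 'st_pix' so predictions can be merged back.
```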
@register
class CommonDataset(object):
def __init__(self, **dataset_args):
super(CommonDataset, self).__init__()
dataset_args = copy.deepcopy(dataset_args)
type = dataset_args.pop("name")
self.dataset = getattr(source, type)(**dataset_args)
def __call__(self):
return self.dataset
@register
class TrainDataset(CommonDataset):
pass
@register
class EvalMOTDataset(CommonDataset):
pass
@register
class TestMOTDataset(CommonDataset):
pass
@register
class EvalDataset(CommonDataset):
pass
@register
class TestDataset(CommonDataset):
pass
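These thin wrappers all share `CommonDataset`'s factory behavior: 'name' picks a class out of `ppdet.data.source`, and the remaining kwargs go to its constructor. A sketch with hypothetical paths:

```python
train = TrainDataset(
    name='COCODataSet',                  # resolved via getattr(source, name)
    dataset_dir='dataset/coco',          # hypothetical dataset layout
    image_dir='train2017',
    anno_path='annotations/instances_train2017.json')
dataset = train()                        # __call__ hands back the built source
```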

@@ -39,6 +39,7 @@ class MOTDataSet(DetDataset):
image_lists (str|list): mot data image lists, muiti-source mot dataset.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
repeat (int): repeat times for dataset, use in benchmark.
Notes:
MOT datasets root directory following this:
@@ -77,11 +78,13 @@ class MOTDataSet(DetDataset):
dataset_dir=None,
image_lists=[],
data_fields=['image'],
sample_num=-1):
sample_num=-1,
repeat=1):
super(MOTDataSet, self).__init__(
dataset_dir=dataset_dir,
data_fields=data_fields,
sample_num=sample_num)
sample_num=sample_num,
repeat=repeat)
self.dataset_dir = dataset_dir
self.image_lists = image_lists
if isinstance(self.image_lists, str):
@@ -474,6 +477,7 @@ class MOTImageFolder(DetDataset):
image_dir=None,
sample_num=-1,
keep_ori_im=False,
anno_path=None,
**kwargs):
super(MOTImageFolder, self).__init__(
dataset_dir, image_dir, sample_num=sample_num)
@@ -483,6 +487,7 @@ class MOTImageFolder(DetDataset):
self._imid2path = {}
self.roidbs = None
self.frame_rate = frame_rate
self.anno_path = anno_path
def check_or_download_dataset(self):
return
@@ -573,6 +578,9 @@ class MOTImageFolder(DetDataset):
"wrong or unsupported file format: {}".format(self.video_file)
self.roidbs = self._load_video_images()
def get_anno(self):
return self.anno_path
def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', 'flv')):
return f.lower().endswith(extensions)

@@ -46,6 +46,7 @@ class VOCDataSet(DetDataset):
empty_ratio (float): the ratio of empty record number to total
record's, if empty_ratio is out of [0. ,1.), do not sample the
records and use all the empty entries. 1. as default
repeat (int): repeat times for dataset, use in benchmark.
"""
def __init__(self,
@@ -56,13 +57,15 @@ class VOCDataSet(DetDataset):
sample_num=-1,
label_list=None,
allow_empty=False,
empty_ratio=1.):
empty_ratio=1.,
repeat=1):
super(VOCDataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num)
sample_num=sample_num,
repeat=repeat)
self.label_list = label_list
self.allow_empty = allow_empty
self.empty_ratio = empty_ratio

@@ -16,11 +16,13 @@ from . import operators
from . import batch_operators
from . import keypoint_operators
from . import mot_operators
from . import rotated_operators
from .operators import *
from .batch_operators import *
from .keypoint_operators import *
from .mot_operators import *
from .rotated_operators import *
__all__ = []
__all__ += registered_ops

@@ -47,6 +47,8 @@ __all__ = [
'PadMaskBatch',
'Gt2GFLTarget',
'Gt2CenterNetTarget',
'PadGT',
'PadRGT',
]
@@ -108,12 +110,6 @@ class PadBatch(BaseOperator):
padding_segm[:, :im_h, :im_w] = gt_segm
data['gt_segm'] = padding_segm
if 'gt_rbox2poly' in data and data['gt_rbox2poly'] is not None:
# ploy to rbox
polys = data['gt_rbox2poly']
rbox = bbox_utils.poly2rbox(polys)
data['gt_rbox'] = rbox
return samples
@@ -980,12 +976,6 @@ class PadMaskBatch(BaseOperator):
padding_mask[:im_h, :im_w] = 1.
data['pad_mask'] = padding_mask
if 'gt_rbox2poly' in data and data['gt_rbox2poly'] is not None:
# ploy to rbox
polys = data['gt_rbox2poly']
rbox = bbox_utils.poly2rbox(polys)
data['gt_rbox'] = rbox
return samples
@@ -1068,3 +1058,110 @@ class Gt2CenterNetTarget(BaseOperator):
sample['size'] = wh
sample['offset'] = reg
return sample
@register_op
class PadGT(BaseOperator):
"""
Pad 0 to `gt_class`, `gt_bbox`, `gt_score`...
The num_max_boxes is the largest for batch.
Args:
return_gt_mask (bool): If true, return `pad_gt_mask`,
1 means bbox, 0 means no bbox.
"""
def __init__(self, return_gt_mask=True):
super(PadGT, self).__init__()
self.return_gt_mask = return_gt_mask
def __call__(self, samples, context=None):
num_max_boxes = max([len(s['gt_bbox']) for s in samples])
for sample in samples:
if self.return_gt_mask:
sample['pad_gt_mask'] = np.zeros(
(num_max_boxes, 1), dtype=np.float32)
if num_max_boxes == 0:
continue
num_gt = len(sample['gt_bbox'])
pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
if num_gt > 0:
pad_gt_class[:num_gt] = sample['gt_class']
pad_gt_bbox[:num_gt] = sample['gt_bbox']
sample['gt_class'] = pad_gt_class
sample['gt_bbox'] = pad_gt_bbox
# pad_gt_mask
if 'pad_gt_mask' in sample:
sample['pad_gt_mask'][:num_gt] = 1
# gt_score
if 'gt_score' in sample:
pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32)
if num_gt > 0:
pad_gt_score[:num_gt] = sample['gt_score']
sample['gt_score'] = pad_gt_score
if 'is_crowd' in sample:
pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32)
if num_gt > 0:
pad_is_crowd[:num_gt] = sample['is_crowd']
sample['is_crowd'] = pad_is_crowd
if 'difficult' in sample:
pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32)
if num_gt > 0:
pad_diff[:num_gt] = sample['difficult']
sample['difficult'] = pad_diff
return samples
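A worked example of `PadGT` under the semantics above: two samples with 2 and 0 boxes are both padded to `num_max_boxes = 2`, and `pad_gt_mask` records which rows are real.

```python
import numpy as np

samples = [
    {'gt_class': np.array([[1], [3]]), 'gt_bbox': np.random.rand(2, 4)},
    {'gt_class': np.zeros((0, 1)),     'gt_bbox': np.zeros((0, 4))},
]
samples = PadGT(return_gt_mask=True)(samples)
print(samples[0]['pad_gt_mask'].T)   # [[1. 1.]]  both rows are real boxes
print(samples[1]['pad_gt_mask'].T)   # [[0. 0.]]  all padding
print(samples[1]['gt_bbox'].shape)   # (2, 4), zero-filled
```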
@register_op
class PadRGT(BaseOperator):
"""
Pad 0 to `gt_class`, `gt_bbox`, `gt_score`...
The num_max_boxes is the largest for batch.
Args:
return_gt_mask (bool): If true, return `pad_gt_mask`,
1 means bbox, 0 means no bbox.
"""
def __init__(self, return_gt_mask=True):
super(PadRGT, self).__init__()
self.return_gt_mask = return_gt_mask
def pad_field(self, sample, field, num_gt):
name, shape, dtype = field
if name in sample:
pad_v = np.zeros(shape, dtype=dtype)
if num_gt > 0:
pad_v[:num_gt] = sample[name]
sample[name] = pad_v
def __call__(self, samples, context=None):
num_max_boxes = max([len(s['gt_bbox']) for s in samples])
for sample in samples:
if self.return_gt_mask:
sample['pad_gt_mask'] = np.zeros(
(num_max_boxes, 1), dtype=np.float32)
if num_max_boxes == 0:
continue
num_gt = len(sample['gt_bbox'])
pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
if num_gt > 0:
pad_gt_class[:num_gt] = sample['gt_class']
pad_gt_bbox[:num_gt] = sample['gt_bbox']
sample['gt_class'] = pad_gt_class
sample['gt_bbox'] = pad_gt_bbox
# pad_gt_mask
if 'pad_gt_mask' in sample:
sample['pad_gt_mask'][:num_gt] = 1
# gt_score
names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox']
dims = [1, 1, 1, 8, 5]
dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32]
for name, dim, dtype in zip(names, dims, dtypes):
self.pad_field(sample, [name, (num_max_boxes, dim), dtype],
num_gt)
return samples

@@ -698,8 +698,8 @@ class ToHeatmapsTopDown(object):
tmp_size = self.sigma * 3
feat_stride = image_size / self.hmsize
for joint_id in range(num_joints):
mu_x = int(joints[joint_id][0] + 0.5) / feat_stride[0]
mu_y = int(joints[joint_id][1] + 0.5) / feat_stride[1]
mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
# Check that any part of the gaussian is in-bounds
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
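The fix above moves the rounding after the division, so the keypoint is quantized in heatmap coordinates rather than image coordinates. A quick numeric check:

```python
# joint x = 37.6, feat_stride = 4
x, stride = 37.6, 4
old_mu_x = int(x + 0.5) / stride   # int(38.1)/4 -> 9.5, not an integer bin
new_mu_x = int(x / stride + 0.5)   # int(9.9)    -> 9, a valid heatmap index
assert (old_mu_x, new_mu_x) == (9.5, 9)
```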

@@ -529,7 +529,7 @@ class Gt2FairMOTTarget(Gt2TTFTarget):
Generate FairMOT targets by ground truth data.
Difference between Gt2FairMOTTarget and Gt2TTFTarget are:
1. the gaussian kernal radius to generate a heatmap.
2. the targets needed during traing.
2. the targets needed during training.
Args:
num_classes(int): the number of classes.

@@ -41,7 +41,6 @@ import threading
MUTEX = threading.Lock()
from paddlers.models.ppdet.core.workspace import serializable
from paddlers.models.ppdet.modeling import bbox_utils
from ..reader import Compose
from .op_helper import (satisfy_sample_constraint, filter_and_process,
@@ -123,12 +122,15 @@ class Decode(BaseOperator):
sample['image'] = f.read()
sample.pop('im_file')
try:
im = sample['image']
data = np.frombuffer(im, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
if 'keep_ori_im' in sample and sample['keep_ori_im']:
sample['ori_image'] = im
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
except:
im = sample['image']
sample['image'] = im
if 'h' not in sample:
@@ -357,19 +359,26 @@ class RandomErasingImage(BaseOperator):
@register_op
class NormalizeImage(BaseOperator):
def __init__(self, mean=[0.485, 0.456, 0.406], std=[1, 1, 1],
is_scale=True):
def __init__(self,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
is_scale=True,
norm_type='mean_std'):
"""
Args:
mean (list): the pixel mean
std (list): the pixel variance
is_scale (bool): scale the pixel to [0,1]
norm_type (str): type in ['mean_std', 'none']
"""
super(NormalizeImage, self).__init__()
self.mean = mean
self.std = std
self.is_scale = is_scale
self.norm_type = norm_type
if not (isinstance(self.mean, list) and isinstance(self.std, list) and
isinstance(self.is_scale, bool)):
isinstance(self.is_scale, bool) and
self.norm_type in ['mean_std', 'none']):
raise TypeError("{}: input type is invalid.".format(self))
from functools import reduce
if reduce(lambda x, y: x * y, self.std) == 0:
@@ -378,20 +387,20 @@ class NormalizeImage(BaseOperator):
def apply(self, sample, context=None):
"""Normalize the image.
Operators:
1.(optional) Scale the image to [0,1]
2. Each pixel minus mean and is divided by std
1.(optional) Scale the pixel to [0,1]
2.(optional) Each pixel minus mean and is divided by std
"""
im = sample['image']
im = im.astype(np.float32, copy=False)
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
if self.is_scale:
im = im / 255.0
scale = 1.0 / 255.0
im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
im -= mean
im /= std
sample['image'] = im
return sample
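A hedged usage sketch of the reworked operator: with the new defaults it scales to [0, 1] and then standardizes with ImageNet statistics, while `norm_type='none'` keeps only the scaling step.

```python
import numpy as np

op = NormalizeImage(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
    is_scale=True,
    norm_type='mean_std')
sample = {'image': np.random.randint(0, 256, (32, 32, 3)).astype(np.float32)}
out = op.apply(sample)               # per-channel (x/255 - mean) / std
```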
@@ -448,6 +457,10 @@ class GridMask(BaseOperator):
@register_op
class RandomDistort(BaseOperator):
"""Random color distortion.
Note:
The 'probability' in [lower, upper, probability] is the probability of not using this transformation,
not the probability of using this transformation. And this only applies in this operator(RandomDistort),
'probability' in other BaseOperator means the probability of using that transformation.
Args:
hue (list): hue settings. in [lower, upper, probability] format.
saturation (list): saturation settings. in [lower, upper, probability] format.
@@ -657,18 +670,6 @@ class RandomFlip(BaseOperator):
bbox[:, 2] = width - oldx1
return bbox
def apply_rbox(self, bbox, width):
oldx1 = bbox[:, 0].copy()
oldx2 = bbox[:, 2].copy()
oldx3 = bbox[:, 4].copy()
oldx4 = bbox[:, 6].copy()
bbox[:, 0] = width - oldx1
bbox[:, 2] = width - oldx2
bbox[:, 4] = width - oldx3
bbox[:, 6] = width - oldx4
bbox = [bbox_utils.get_best_begin_point_single(e) for e in bbox]
return bbox
def apply(self, sample, context=None):
"""Filp the image and bounding box.
Operators:
@@ -700,10 +701,6 @@ class RandomFlip(BaseOperator):
if 'gt_segm' in sample and sample['gt_segm'].any():
sample['gt_segm'] = sample['gt_segm'][:, :, ::-1]
if 'gt_rbox2poly' in sample and sample['gt_rbox2poly'].any():
sample['gt_rbox2poly'] = self.apply_rbox(sample['gt_rbox2poly'],
width)
sample['flipped'] = True
sample['image'] = im
return sample
@@ -824,7 +821,7 @@ class Resize(BaseOperator):
im_scale_x = resize_w / im_shape[1]
im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
sample['image'] = im
sample['image'] = im.astype(np.float32)
sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
if 'scale_factor' in sample:
scale_factor = sample['scale_factor']
@@ -841,16 +838,6 @@ class Resize(BaseOperator):
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply rbox
if 'gt_rbox2poly' in sample:
if np.array(sample['gt_rbox2poly']).shape[1] != 8:
logger.warning(
"gt_rbox2poly's length shoule be 8, but actually is {}".
format(len(sample['gt_rbox2poly'])))
sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
@@ -1054,7 +1041,7 @@ class CropWithSampling(BaseOperator):
[max sample, max trial, min scale, max scale,
min aspect ratio, max aspect ratio,
min overlap, max overlap]
avoid_no_bbox (bool): whether to to avoid the
avoid_no_bbox (bool): whether to avoid the
situation where the box does not appear.
"""
super(CropWithSampling, self).__init__()
@@ -1145,7 +1132,7 @@ class CropWithDataAchorSampling(BaseOperator):
das_anchor_scales (list[float]): a list of anchor scales in data
anchor smapling.
min_size (float): minimum size of sampled bbox.
avoid_no_bbox (bool): whether to to avoid the
avoid_no_bbox (bool): whether to avoid the
situation where the box does not appear.
"""
super(CropWithDataAchorSampling, self).__init__()
@@ -1504,6 +1491,11 @@ class RandomCrop(BaseOperator):
if 'is_crowd' in sample:
sample['is_crowd'] = np.take(
sample['is_crowd'], valid_ids, axis=0)
if 'difficult' in sample:
sample['difficult'] = np.take(
sample['difficult'], valid_ids, axis=0)
return sample
return sample
@@ -1747,7 +1739,7 @@ class Mixup(BaseOperator):
gt_score2 = np.ones_like(sample[1]['gt_class'])
gt_score = np.concatenate(
(gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
result['gt_score'] = gt_score
result['gt_score'] = gt_score.astype('float32')
if 'is_crowd' in sample[0]:
is_crowd1 = sample[0]['is_crowd']
is_crowd2 = sample[1]['is_crowd']
@@ -2029,13 +2021,14 @@ class Pad(BaseOperator):
if self.size:
h, w = self.size
assert (
im_h < h and im_w < w
im_h <= h and im_w <= w
), '(h, w) of target size should be greater than (im_h, im_w)'
else:
h = np.ceil(im_h / self.size_divisor) * self.size_divisor
w = np.ceil(im_w / self.size_divisor) * self.size_divisor
h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
if h == im_h and w == im_w:
sample['image'] = im.astype(np.float32)
return sample
if self.pad_mode == -1:
@@ -2105,45 +2098,31 @@ class Poly2Mask(BaseOperator):
return sample
@register_op
class Rbox2Poly(BaseOperator):
"""
Convert rbbox format to poly format.
"""
def __init__(self):
super(Rbox2Poly, self).__init__()
def apply(self, sample, context=None):
assert 'gt_rbox' in sample
assert sample['gt_rbox'].shape[1] == 5
rrects = sample['gt_rbox']
x_ctr = rrects[:, 0]
y_ctr = rrects[:, 1]
width = rrects[:, 2]
height = rrects[:, 3]
x1 = x_ctr - width / 2.0
y1 = y_ctr - height / 2.0
x2 = x_ctr + width / 2.0
y2 = y_ctr + height / 2.0
sample['gt_bbox'] = np.stack([x1, y1, x2, y2], axis=1)
polys = bbox_utils.rbox2poly_np(rrects)
sample['gt_rbox2poly'] = polys
return sample
@register_op
class AugmentHSV(BaseOperator):
def __init__(self, fraction=0.50, is_bgr=True):
"""
Augment the SV channel of image data.
Args:
fraction (float): the fraction for augment. Default: 0.5.
is_bgr (bool): whether the image is BGR mode. Default: True.
hgain (float): H channel gains
sgain (float): S channel gains
vgain (float): V channel gains
"""
def __init__(self,
fraction=0.50,
is_bgr=True,
hgain=None,
sgain=None,
vgain=None):
super(AugmentHSV, self).__init__()
self.fraction = fraction
self.is_bgr = is_bgr
self.hgain = hgain
self.sgain = sgain
self.vgain = vgain
self.use_hsvgain = False if hgain is None else True
def apply(self, sample, context=None):
img = sample['image']
@@ -2151,6 +2130,17 @@ class AugmentHSV(BaseOperator):
img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
else:
img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
if self.use_hsvgain:
hsv_augs = np.random.uniform(
-1, 1, 3) * [self.hgain, self.sgain, self.vgain]
# random selection of h, s, v
hsv_augs *= np.random.randint(0, 2, 3)
img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180
img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)
img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)
else:
S = img_hsv[:, :, 1].astype(np.float32)
V = img_hsv[:, :, 2].astype(np.float32)
@@ -2166,12 +2156,13 @@ class AugmentHSV(BaseOperator):
img_hsv[:, :, 1] = S.astype(np.uint8)
img_hsv[:, :, 2] = V.astype(np.uint8)
if self.is_bgr:
cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)
else:
cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img)
sample['image'] = img
sample['image'] = img.astype(np.float32)
return sample
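The new gain-based branch jitters H, S, and V independently: each channel gets a uniform offset scaled by its gain, the `randint(0, 2)` mask randomly disables channels, hue wraps modulo 180 (OpenCV's hue range), and S/V clip to [0, 255]. Sketched in isolation with hypothetical gain values:

```python
import numpy as np

hgain, sgain, vgain = 5, 30, 30            # hypothetical gains
hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain]
hsv_augs *= np.random.randint(0, 2, 3)     # keep or drop each channel
img_hsv = np.random.randint(0, 256, (4, 4, 3)).astype(np.float32)
img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180
img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)
img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)
```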
@@ -2425,16 +2416,6 @@ class RandomResizeCrop(BaseOperator):
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply rbox
if 'gt_rbox2poly' in sample:
if np.array(sample['gt_rbox2poly']).shape[1] != 8:
logger.warn(
"gt_rbox2poly's length shoule be 8, but actually is {}".
format(len(sample['gt_rbox2poly'])))
sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
@@ -3013,3 +2994,409 @@ class CenterRandColor(BaseOperator):
img = func(img, img_gray)
sample['image'] = img
return sample
@register_op
class Mosaic(BaseOperator):
""" Mosaic operator for image and gt_bboxes
The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py
1. get mosaic coords
2. clip bbox and get mosaic_labels
3. random_affine augment
4. Mixup augment as copypaste (optional), not used in tiny/nano
Args:
prob (float): probability of using Mosaic, 1.0 as default
input_dim (list[int]): input shape
degrees (list[2]): the rotate range to apply, transform range is [min, max]
translate (list[2]): the translate range to apply, transform range is [min, max]
scale (list[2]): the scale range to apply, transform range is [min, max]
shear (list[2]): the shear range to apply, transform range is [min, max]
enable_mixup (bool): whether to enable Mixup or not
mixup_prob (float): probability of using Mixup, 1.0 as default
mixup_scale (list[int]): scale range of Mixup
remove_outside_box (bool): whether remove outside boxes, False as
default in COCO dataset, True in MOT dataset
"""
def __init__(self,
prob=1.0,
input_dim=[640, 640],
degrees=[-10, 10],
translate=[-0.1, 0.1],
scale=[0.1, 2],
shear=[-2, 2],
enable_mixup=True,
mixup_prob=1.0,
mixup_scale=[0.5, 1.5],
remove_outside_box=False):
super(Mosaic, self).__init__()
self.prob = prob
if isinstance(input_dim, Integral):
input_dim = [input_dim, input_dim]
self.input_dim = input_dim
self.degrees = degrees
self.translate = translate
self.scale = scale
self.shear = shear
self.enable_mixup = enable_mixup
self.mixup_prob = mixup_prob
self.mixup_scale = mixup_scale
self.remove_outside_box = remove_outside_box
def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w):
# (x1, y1, x2, y2) means coords in large image,
# small_coords means coords in small image in mosaic aug.
if mosaic_idx == 0:
# top left
x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
small_coords = w - (x2 - x1), h - (y2 - y1), w, h
elif mosaic_idx == 1:
# top right
x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h
elif mosaic_idx == 2:
# bottom left
x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h)
elif mosaic_idx == 3:
# bottom right
x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2,
yc + h)
small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h)
return (x1, y1, x2, y2), small_coords
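A worked example of the top-left quadrant: with mosaic center (xc, yc) = (640, 640), a 480x480 tile, and input_dim 640x640, the tile lands flush against the center of the 2x-sized canvas.

```python
xc = yc = 640
w = h = 480
x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
small = (w - (x2 - x1), h - (y2 - y1), w, h)
assert (x1, y1, x2, y2) == (160, 160, 640, 640)
assert small == (0, 0, 480, 480)   # the whole tile is used
```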
def random_affine_augment(self,
img,
labels=[],
input_dim=[640, 640],
degrees=[-10, 10],
scales=[0.1, 2],
shears=[-2, 2],
translates=[-0.1, 0.1]):
# random rotation and scale
degree = random.uniform(degrees[0], degrees[1])
scale = random.uniform(scales[0], scales[1])
assert scale > 0, "Argument scale should be positive."
R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale)
M = np.ones([2, 3])
# random shear
shear = random.uniform(shears[0], shears[1])
shear_x = math.tan(shear * math.pi / 180)
shear_y = math.tan(shear * math.pi / 180)
M[0] = R[0] + shear_y * R[1]
M[1] = R[1] + shear_x * R[0]
# random translation
translate = random.uniform(translates[0], translates[1])
translation_x = translate * input_dim[0]
translation_y = translate * input_dim[1]
M[0, 2] = translation_x
M[1, 2] = translation_y
# warpAffine
img = cv2.warpAffine(
img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114))
num_gts = len(labels)
if num_gts > 0:
# warp corner points
corner_points = np.ones((4 * num_gts, 3))
corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1
# apply affine transform
corner_points = corner_points @M.T
corner_points = corner_points.reshape(num_gts, 8)
# create new boxes
corner_xs = corner_points[:, 0::2]
corner_ys = corner_points[:, 1::2]
new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1),
corner_xs.max(1), corner_ys.max(1)))
new_bboxes = new_bboxes.reshape(4, num_gts).T
# clip boxes
new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0])
new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1])
labels[:, :4] = new_bboxes
return img, labels
def __call__(self, sample, context=None):
if not isinstance(sample, Sequence):
return sample
assert len(
sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup."
if np.random.uniform(0., 1.) > self.prob:
return sample[0]
mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], []
input_h, input_w = self.input_dim
yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))
xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))
mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8)
# 1. get mosaic coords
for mosaic_idx, sp in enumerate(sample[:4]):
img = sp['image']
gt_bbox = sp['gt_bbox']
h0, w0 = img.shape[:2]
scale = min(1. * input_h / h0, 1. * input_w / w0)
img = cv2.resize(
img, (int(w0 * scale), int(h0 * scale)),
interpolation=cv2.INTER_LINEAR)
(h, w, c) = img.shape[:3]
# suffix l means large image, while s means small image in mosaic aug.
(l_x1, l_y1, l_x2, l_y2), (
s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords(
mosaic_idx, xc, yc, w, h, input_h, input_w)
mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
padw, padh = l_x1 - s_x1, l_y1 - s_y1
# Normalized xywh to pixel xyxy format
_gt_bbox = gt_bbox.copy()
if len(gt_bbox) > 0:
_gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw
_gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh
_gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw
_gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh
mosaic_gt_bbox.append(_gt_bbox)
mosaic_gt_class.append(sp['gt_class'])
if 'is_crowd' in sp:
mosaic_is_crowd.append(sp['is_crowd'])
if 'difficult' in sp:
mosaic_difficult.append(sp['difficult'])
# 2. clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd])
if len(mosaic_gt_bbox):
mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0)
mosaic_gt_class = np.concatenate(mosaic_gt_class, 0)
if mosaic_is_crowd:
mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0)
mosaic_labels = np.concatenate([
mosaic_gt_bbox,
mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
mosaic_is_crowd.astype(mosaic_gt_bbox.dtype)
], 1)
elif mosaic_difficult:
mosaic_difficult = np.concatenate(mosaic_difficult, 0)
mosaic_labels = np.concatenate([
mosaic_gt_bbox,
mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
mosaic_difficult.astype(mosaic_gt_bbox.dtype)
], 1)
else:
mosaic_labels = np.concatenate([
mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype)
], 1)
if self.remove_outside_box:
# for MOT dataset
flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w
flag2 = mosaic_gt_bbox[:, 2] > 0
flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h
flag4 = mosaic_gt_bbox[:, 3] > 0
flag_all = flag1 * flag2 * flag3 * flag4
mosaic_labels = mosaic_labels[flag_all]
else:
mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0,
2 * input_w)
mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0,
2 * input_h)
mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0,
2 * input_w)
mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0,
2 * input_h)
else:
mosaic_labels = np.zeros((1, 6))
# 3. random_affine augment
mosaic_img, mosaic_labels = self.random_affine_augment(
mosaic_img,
mosaic_labels,
input_dim=self.input_dim,
degrees=self.degrees,
translates=self.translate,
scales=self.scale,
shears=self.shear)
# 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177
# optional, not used (enable_mixup=False) in tiny/nano
if (self.enable_mixup and not len(mosaic_labels) == 0 and
random.random() < self.mixup_prob):
sample_mixup = sample[4]
mixup_img = sample_mixup['image']
if 'is_crowd' in sample_mixup:
cp_labels = np.concatenate([
sample_mixup['gt_bbox'],
sample_mixup['gt_class'].astype(mosaic_labels.dtype),
sample_mixup['is_crowd'].astype(mosaic_labels.dtype)
], 1)
elif 'difficult' in sample_mixup:
cp_labels = np.concatenate([
sample_mixup['gt_bbox'],
sample_mixup['gt_class'].astype(mosaic_labels.dtype),
sample_mixup['difficult'].astype(mosaic_labels.dtype)
], 1)
else:
cp_labels = np.concatenate([
sample_mixup['gt_bbox'],
sample_mixup['gt_class'].astype(mosaic_labels.dtype)
], 1)
mosaic_img, mosaic_labels = self.mixup_augment(
mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img)
sample0 = sample[0]
sample0['image'] = mosaic_img.astype(np.uint8) # can not be float32
sample0['h'] = float(mosaic_img.shape[0])
sample0['w'] = float(mosaic_img.shape[1])
sample0['im_shape'][0] = sample0['h']
sample0['im_shape'][1] = sample0['w']
sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32)
sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32)
if 'is_crowd' in sample[0]:
sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32)
if 'difficult' in sample[0]:
sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32)
return sample0
def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels,
img):
jit_factor = random.uniform(*self.mixup_scale)
FLIP = random.uniform(0, 1) > 0.5
if len(img.shape) == 3:
cp_img = np.ones(
(input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114
else:
cp_img = np.ones(input_dim, dtype=np.uint8) * 114
cp_scale_ratio = min(input_dim[0] / img.shape[0],
input_dim[1] / img.shape[1])
resized_img = cv2.resize(
img, (int(img.shape[1] * cp_scale_ratio),
int(img.shape[0] * cp_scale_ratio)),
interpolation=cv2.INTER_LINEAR)
cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[
1] * cp_scale_ratio)] = resized_img
cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor),
int(cp_img.shape[0] * jit_factor)))
cp_scale_ratio *= jit_factor
if FLIP:
cp_img = cp_img[:, ::-1, :]
origin_h, origin_w = cp_img.shape[:2]
target_h, target_w = origin_img.shape[:2]
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w), 3),
dtype=np.uint8)
padded_img[:origin_h, :origin_w] = cp_img
x_offset, y_offset = 0, 0
if padded_img.shape[0] > target_h:
y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
if padded_img.shape[1] > target_w:
x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset:
x_offset + target_w]
# adjust boxes
cp_bboxes_origin_np = cp_labels[:, :4].copy()
cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] *
cp_scale_ratio, 0, origin_w)
cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] *
cp_scale_ratio, 0, origin_h)
if FLIP:
cp_bboxes_origin_np[:, 0::2] = (
origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1])
cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
if self.remove_outside_box:
# for MOT dataset
cp_bboxes_transformed_np[:, 0::2] -= x_offset
cp_bboxes_transformed_np[:, 1::2] -= y_offset
else:
cp_bboxes_transformed_np[:, 0::2] = np.clip(
cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w)
cp_bboxes_transformed_np[:, 1::2] = np.clip(
cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h)
cls_labels = cp_labels[:, 4:5].copy()
box_labels = cp_bboxes_transformed_np
if cp_labels.shape[-1] == 6:
crd_labels = cp_labels[:, 5:6].copy()
labels = np.hstack((box_labels, cls_labels, crd_labels))
else:
labels = np.hstack((box_labels, cls_labels))
if self.remove_outside_box:
labels = labels[labels[:, 0] < target_w]
labels = labels[labels[:, 2] > 0]
labels = labels[labels[:, 1] < target_h]
labels = labels[labels[:, 3] > 0]
origin_labels = np.vstack((origin_labels, labels))
origin_img = origin_img.astype(np.float32)
origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(
np.float32)
return origin_img.astype(np.uint8), origin_labels
@register_op
class PadResize(BaseOperator):
""" PadResize for image and gt_bbbox
Args:
target_size (list[int]): input shape
fill_value (float): pixel value of padded image
"""
def __init__(self, target_size, fill_value=114):
super(PadResize, self).__init__()
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
self.fill_value = fill_value
def _resize(self, img, bboxes, labels):
ratio = min(self.target_size[0] / img.shape[0],
self.target_size[1] / img.shape[1])
w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio)
resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR)
if len(bboxes) > 0:
bboxes *= ratio
mask = np.minimum(bboxes[:, 2] - bboxes[:, 0],
bboxes[:, 3] - bboxes[:, 1]) > 1
bboxes = bboxes[mask]
labels = labels[mask]
return resized_img, bboxes, labels
def _pad(self, img):
h, w, _ = img.shape
if h == self.target_size[0] and w == self.target_size[1]:
return img
padded_img = np.full(
(self.target_size[0], self.target_size[1], 3),
self.fill_value,
dtype=np.uint8)
padded_img[:h, :w] = img
return padded_img
def apply(self, sample, context=None):
image = sample['image']
bboxes = sample['gt_bbox']
labels = sample['gt_class']
image, bboxes, labels = self._resize(image, bboxes, labels)
sample['image'] = self._pad(image).astype(np.float32)
sample['gt_bbox'] = bboxes
sample['gt_class'] = labels
return sample
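A hedged usage sketch for `PadResize`: letterbox-style resize into a 640x640 canvas filled with 114, scaling boxes by the same ratio and dropping any whose shorter side collapses below 1 pixel.

```python
import numpy as np

op = PadResize(target_size=640, fill_value=114)
sample = {
    'image': np.random.randint(0, 256, (480, 320, 3), dtype=np.uint8),
    'gt_bbox': np.array([[10., 10., 100., 100.]], dtype=np.float32),
    'gt_class': np.array([[0]], dtype=np.int32),
}
out = op.apply(sample)
print(out['image'].shape)   # (640, 640, 3); ratio = min(640/480, 640/320)
```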

@@ -0,0 +1,479 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from numbers import Number, Integral
import cv2
import numpy as np
import math
import copy
from .operators import register_op, BaseOperator
from paddlers.models.ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@register_op
class RRotate(BaseOperator):
""" Rotate Image, Polygon, Box
Args:
scale (float): rotate scale
angle (float): rotate angle
fill_value (int, tuple): fill color
auto_bound (bool): whether auto bound or not
"""
def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True):
super(RRotate, self).__init__()
self.scale = scale
self.angle = angle
self.fill_value = fill_value
self.auto_bound = auto_bound
def get_rotated_matrix(self, angle, scale, h, w):
center = ((w - 1) * 0.5, (h - 1) * 0.5)
matrix = cv2.getRotationMatrix2D(center, -angle, scale)
# calculate the new size
cos = np.abs(matrix[0, 0])
sin = np.abs(matrix[0, 1])
new_w = h * sin + w * cos
new_h = h * cos + w * sin
# calculate offset
n_w = int(np.round(new_w))
n_h = int(np.round(new_h))
if self.auto_bound:
ratio = min(w / n_w, h / n_h)
matrix = cv2.getRotationMatrix2D(center, -angle, ratio)
else:
matrix[0, 2] += (new_w - w) * 0.5
matrix[1, 2] += (new_h - h) * 0.5
w = n_w
h = n_h
return matrix, h, w
def get_rect_from_pts(self, pts, h, w):
""" get minimum rectangle of points
"""
assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
min_x, min_y = np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2],
axis=1)
max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2],
axis=1)
min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h)
max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h)
boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1)
return boxes
def apply_image(self, image, matrix, h, w):
return cv2.warpAffine(
image, matrix, (w, h), borderValue=self.fill_value)
def apply_pts(self, pts, matrix, h, w):
assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
# n is number of samples and m is two times the number of points due to (x, y)
_, m = pts.shape
# transpose points
pts_ = pts.reshape(-1, 2).T
# pad 1 to convert the points to homogeneous coordinates
padding = np.ones((1, pts_.shape[1]), pts.dtype)
rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0))
return rotated_pts[:2, :].T.reshape(-1, m)
def apply(self, sample, context=None):
image = sample['image']
h, w = image.shape[:2]
matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w)
sample['image'] = self.apply_image(image, matrix, h, w)
polys = sample['gt_poly']
# TODO: segment or keypoint to be processed
if len(polys) > 0:
pts = self.apply_pts(polys, matrix, h, w)
sample['gt_poly'] = pts
sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w)
return sample
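The `auto_bound` geometry above can be checked by hand: rotating by 90 degrees swaps the bounding extents, and the matrix is rescaled so the result still fits the original canvas.

```python
import numpy as np

h, w, angle = 100, 200, 90.0
cos = abs(np.cos(np.deg2rad(angle)))
sin = abs(np.sin(np.deg2rad(angle)))
new_w, new_h = h * sin + w * cos, h * cos + w * sin
print(round(new_w), round(new_h))   # 100 200: width and height swap
ratio = min(w / round(new_w), h / round(new_h))
print(ratio)                        # 0.5: shrink factor when auto_bound=True
```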
@register_op
class RandomRRotate(BaseOperator):
""" Random Rotate Image
Args:
scale (float, tuple, list): rotate scale
scale_mode (str): mode of scale, [range, value, None]
angle (float, tuple, list): rotate angle
angle_mode (str): mode of angle, [range, value, None]
fill_value (float, tuple, list): fill value
rotate_prob (float): probability of rotation
auto_bound (bool): whether auto bound or not
"""
def __init__(self,
scale=1.0,
scale_mode=None,
angle=0.,
angle_mode=None,
fill_value=0.,
rotate_prob=1.0,
auto_bound=True):
super(RandomRRotate, self).__init__()
self.scale = scale
self.scale_mode = scale_mode
self.angle = angle
self.angle_mode = angle_mode
self.fill_value = fill_value
self.rotate_prob = rotate_prob
self.auto_bound = auto_bound
def get_angle(self, angle, angle_mode):
assert not angle_mode or angle_mode in [
'range', 'value'
], 'angle mode should be in [range, value, None]'
if not angle_mode:
return angle
elif angle_mode == 'range':
low, high = angle
return np.random.rand() * (high - low) + low
elif angle_mode == 'value':
return np.random.choice(angle)
def get_scale(self, scale, scale_mode):
assert not scale_mode or scale_mode in [
'range', 'value'
], 'scale mode should be in [range, value, None]'
if not scale_mode:
return scale
elif scale_mode == 'range':
low, high = scale
return np.random.rand() * (high - low) + low
elif scale_mode == 'value':
return np.random.choice(scale)
def apply(self, sample, context=None):
if np.random.rand() > self.rotate_prob:
return sample
angle = self.get_angle(self.angle, self.angle_mode)
scale = self.get_scale(self.scale, self.scale_mode)
rotator = RRotate(scale, angle, self.fill_value, self.auto_bound)
return rotator(sample)
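# Example (illustrative): sample the angle uniformly from [-180, 180) and
# rotate only half of the samples:
#
#   op = RandomRRotate(angle=[-180, 180], angle_mode='range', rotate_prob=0.5)
#   sample = op(sample)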
@register_op
class Poly2RBox(BaseOperator):
""" Polygon to Rotated Box, using new OpenCV definition since 4.5.1
Args:
filter_threshold (int, float): threshold to filter annotations
filter_mode (str): filter mode, ['area', 'edge']
rbox_type (str): rbox type, ['le135', 'oc']
"""
def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'):
super(Poly2RBox, self).__init__()
self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode)
self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np
def filter(self, size, threshold, mode):
if mode == 'area':
if size[0] * size[1] < threshold:
return True
elif mode == 'edge':
if min(size) < threshold:
return True
return False
def get_rbox(self, polys):
valid_ids, rboxes, bboxes = [], [], []
for i, poly in enumerate(polys):
cx, cy, w, h, angle = self.rbox_fn(poly)
if self.filter_fn((w, h)):
continue
rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32))
valid_ids.append(i)
xmin, ymin = min(poly[0::2]), min(poly[1::2])
xmax, ymax = max(poly[0::2]), max(poly[1::2])
bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32))
if len(valid_ids) == 0:
rboxes = np.zeros((0, 5), dtype=np.float32)
bboxes = np.zeros((0, 4), dtype=np.float32)
else:
rboxes = np.stack(rboxes)
bboxes = np.stack(bboxes)
return rboxes, bboxes, valid_ids
def apply(self, sample, context=None):
rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly'])
sample['gt_rbox'] = rboxes
sample['gt_bbox'] = bboxes
for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']:
if k in sample:
sample[k] = sample[k][valid_ids]
return sample
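# Example (illustrative): convert (N, 8) polygons to 5-parameter rotated
# boxes in the 'le135' definition, dropping boxes whose shorter edge is
# below 4 pixels; the annotation fields are filtered consistently:
#
#   op = Poly2RBox(filter_threshold=4, filter_mode='edge', rbox_type='le135')
#   sample = op(sample)  # adds 'gt_rbox' (N, 5) and rebuilds 'gt_bbox' (N, 4)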
@register_op
class Poly2Array(BaseOperator):
""" convert gt_poly to np.array for rotated bboxes
"""
def __init__(self):
super(Poly2Array, self).__init__()
def apply(self, sample, context=None):
if 'gt_poly' in sample:
sample['gt_poly'] = np.array(
sample['gt_poly'], dtype=np.float32).reshape((-1, 8))
return sample
@register_op
class RResize(BaseOperator):
def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
"""
Resize image to target size. if keep_ratio is True,
resize the image's long side to the maximum of target_size
if keep_ratio is False, resize the image to target size(h, w)
Args:
target_size (int|list): image target size
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): the interpolation method
"""
super(RResize, self).__init__()
self.keep_ratio = keep_ratio
self.interp = interp
if not isinstance(target_size, (Integral, Sequence)):
            raise TypeError(
                "Type of target_size is invalid. Must be Integer, List or "
                "Tuple, but got {}".format(type(target_size)))
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
def apply_image(self, image, scale):
im_scale_x, im_scale_y = scale
return cv2.resize(
image,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
def apply_pts(self, pts, scale, size):
im_scale_x, im_scale_y = scale
resize_w, resize_h = size
pts[:, 0::2] *= im_scale_x
pts[:, 1::2] *= im_scale_y
pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w)
pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h)
return pts
def apply(self, sample, context=None):
""" Resize the image numpy.
"""
im = sample['image']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            raise ValueError('{}: image is not 3-dimensional.'.format(self))
# apply image
im_shape = im.shape
if self.keep_ratio:
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = min(target_size_min / im_size_min,
target_size_max / im_size_max)
resize_h = im_scale * float(im_shape[0])
resize_w = im_scale * float(im_shape[1])
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / im_shape[0]
im_scale_x = resize_w / im_shape[1]
im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
sample['image'] = im.astype(np.float32)
sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
if 'scale_factor' in sample:
scale_factor = sample['scale_factor']
sample['scale_factor'] = np.asarray(
[scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
dtype=np.float32)
else:
sample['scale_factor'] = np.asarray(
[im_scale_y, im_scale_x], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_pts(sample['gt_poly'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
return sample
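# A worked example of the keep_ratio branch: for a 600x1000 image and
# target_size=[800, 1333], im_scale = min(800 / 600, 1333 / 1000) ≈ 1.333,
# so the image is resized to roughly (800, 1333) and 'scale_factor' becomes
# approximately [1.333, 1.333].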
@register_op
class RandomRFlip(BaseOperator):
def __init__(self, prob=0.5):
"""
Args:
prob (float): the probability of flipping image
"""
super(RandomRFlip, self).__init__()
self.prob = prob
if not (isinstance(self.prob, float)):
raise TypeError("{}: input type is invalid.".format(self))
def apply_image(self, image):
return image[:, ::-1, :]
def apply_pts(self, pts, width):
oldx = pts[:, 0::2].copy()
pts[:, 0::2] = width - oldx - 1
return pts
def apply(self, sample, context=None):
"""Filp the image and bounding box.
Operators:
1. Flip the image numpy.
2. Transform the bboxes' x coordinates.
(Must judge whether the coordinates are normalized!)
3. Transform the segmentations' x coordinates.
(Must judge whether the coordinates are normalized!)
Output:
sample: the image, bounding box and segmentation part
in sample are flipped.
"""
if np.random.uniform(0, 1) < self.prob:
im = sample['image']
height, width = im.shape[:2]
im = self.apply_image(im)
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width)
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width)
sample['flipped'] = True
sample['image'] = im
return sample
@register_op
class VisibleRBox(BaseOperator):
"""
In debug mode, visualize images according to `gt_box`.
(Currently only supported when not cropping and flipping image.)
"""
def __init__(self, output_dir='debug'):
super(VisibleRBox, self).__init__()
self.output_dir = output_dir
if not os.path.isdir(output_dir):
os.makedirs(output_dir)
def apply(self, sample, context=None):
image = Image.fromarray(sample['image'].astype(np.uint8))
out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
width = sample['w']
height = sample['h']
# gt_poly = sample['gt_rbox']
gt_poly = sample['gt_poly']
gt_class = sample['gt_class']
draw = ImageDraw.Draw(image)
for i in range(gt_poly.shape[0]):
x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i]
draw.line(
[(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
width=2,
fill='green')
# draw label
xmin = min(x1, x2, x3, x4)
ymin = min(y1, y2, y3, y4)
text = str(gt_class[i][0])
tw, th = draw.textsize(text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
if 'gt_keypoint' in sample.keys():
gt_keypoint = sample['gt_keypoint']
if self.is_normalized:
for i in range(gt_keypoint.shape[1]):
if i % 2:
gt_keypoint[:, i] = gt_keypoint[:, i] * height
else:
gt_keypoint[:, i] = gt_keypoint[:, i] * width
for i in range(gt_keypoint.shape[0]):
keypoint = gt_keypoint[i]
for j in range(int(keypoint.shape[0] / 2)):
                    x1 = int(round(float(keypoint[2 * j])))
                    y1 = int(round(float(keypoint[2 * j + 1])))
draw.ellipse(
(x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
save_path = os.path.join(self.output_dir, out_file_name)
image.save(save_path, quality=95)
return sample
@register_op
class Rbox2Poly(BaseOperator):
"""
Convert rbbox format to poly format.
"""
def __init__(self):
super(Rbox2Poly, self).__init__()
def apply(self, sample, context=None):
assert 'gt_rbox' in sample
assert sample['gt_rbox'].shape[1] == 5
rboxes = sample['gt_rbox']
polys = rbox2poly_np(rboxes)
sample['gt_poly'] = polys
xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1)
xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1)
        sample['gt_bbox'] = np.stack([xmin, ymin, xmax, ymax], axis=1)
return sample

@ -0,0 +1,72 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import numbers
import numpy as np
try:
from collections.abc import Sequence, Mapping
except Exception:
from collections import Sequence, Mapping
def default_collate_fn(batch):
"""
Default batch collating function for :code:`paddle.io.DataLoader`,
get input data as a list of sample datas, each element in list
if the data of a sample, and sample data should composed of list,
dictionary, string, number, numpy array, this
function will parse input data recursively and stack number,
numpy array and paddle.Tensor datas as batch datas. e.g. for
following input data:
[{'image': np.array(shape=[3, 224, 224]), 'label': 1},
{'image': np.array(shape=[3, 224, 224]), 'label': 3},
{'image': np.array(shape=[3, 224, 224]), 'label': 4},
{'image': np.array(shape=[3, 224, 224]), 'label': 5},]
This default collate function zipped each number and numpy array
field together and stack each field as the batch field as follows:
{'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
Args:
batch(list of sample data): batch should be a list of sample data.
Returns:
Batched data: batched each number, numpy array and paddle.Tensor
in input data.
"""
sample = batch[0]
if isinstance(sample, np.ndarray):
batch = np.stack(batch, axis=0)
return batch
elif isinstance(sample, numbers.Number):
batch = np.array(batch)
return batch
elif isinstance(sample, (str, bytes)):
return batch
elif isinstance(sample, Mapping):
return {
key: default_collate_fn([d[key] for d in batch])
for key in sample
}
elif isinstance(sample, Sequence):
sample_fields_num = len(sample)
        if not all(len(s) == sample_fields_num for s in iter(batch)):
            raise RuntimeError(
                "fields number not same among samples in a batch")
return [default_collate_fn(fields) for fields in zip(*batch)]
raise TypeError("batch data con only contains: tensor, numpy.ndarray, "
"dict, list, number, but got {}".format(type(sample)))

@ -182,7 +182,7 @@ class Checkpointer(Callback):
) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
save_name = str(
epoch_id) if epoch_id != end_epoch - 1 else "model_final"
weight = self.weight
weight = self.weight.state_dict()
elif mode == 'eval':
if 'save_best_model' in status and status['save_best_model']:
for metric in self.model._metrics:
@ -198,13 +198,23 @@ class Checkpointer(Callback):
"training iterations being too few or not " \
"loading the correct weights.")
return
if map_res[key][0] > self.best_ap:
if map_res[key][0] >= self.best_ap:
self.best_ap = map_res[key][0]
save_name = 'best_model'
weight = self.weight
weight = self.weight.state_dict()
logger.info("Best test {} ap is {:0.3f}.".format(
key, self.best_ap))
if weight:
if self.model.use_ema:
# save model and ema_model
save_model(
status['weight'],
self.model.optimizer,
self.save_dir,
save_name,
epoch_id + 1,
ema_model=weight)
else:
save_model(weight, self.model.optimizer, self.save_dir,
save_name, epoch_id + 1)
@ -279,6 +289,157 @@ class VisualDLWriter(Callback):
self.vdl_mAP_step += 1
class WandbCallback(Callback):
def __init__(self, model):
super(WandbCallback, self).__init__(model)
try:
import wandb
self.wandb = wandb
except Exception as e:
logger.error('wandb not found, please install wandb. '
'Use: `pip install wandb`.')
raise e
self.wandb_params = model.cfg.get('wandb', None)
self.save_dir = os.path.join(self.model.cfg.save_dir,
self.model.cfg.filename)
if self.wandb_params is None:
self.wandb_params = {}
for k, v in model.cfg.items():
if k.startswith("wandb_"):
                # strip the "wandb_" prefix (str.lstrip removes characters,
                # not a prefix, and could mangle keys)
                self.wandb_params.update({k[len("wandb_"):]: v})
self._run = None
if dist.get_world_size() < 2 or dist.get_rank() == 0:
_ = self.run
self.run.config.update(self.model.cfg)
self.run.define_metric("epoch")
self.run.define_metric("eval/*", step_metric="epoch")
self.best_ap = 0
@property
def run(self):
if self._run is None:
if self.wandb.run is not None:
                logger.info(
                    "There is an ongoing wandb run which will be used "
                    "for logging. Please use `wandb.finish()` to end it "
                    "if this behaviour is not intended.")
self._run = self.wandb.run
else:
self._run = self.wandb.init(**self.wandb_params)
return self._run
def save_model(self,
optimizer,
save_dir,
save_name,
last_epoch,
ema_model=None,
ap=None,
tags=None):
if dist.get_world_size() < 2 or dist.get_rank() == 0:
model_path = os.path.join(save_dir, save_name)
metadata = {}
metadata["last_epoch"] = last_epoch
if ap:
metadata["ap"] = ap
if ema_model is None:
ema_artifact = self.wandb.Artifact(
name="ema_model-{}".format(self.run.id),
type="model",
metadata=metadata)
model_artifact = self.wandb.Artifact(
name="model-{}".format(self.run.id),
type="model",
metadata=metadata)
ema_artifact.add_file(model_path + ".pdema", name="model_ema")
model_artifact.add_file(model_path + ".pdparams", name="model")
self.run.log_artifact(ema_artifact, aliases=tags)
                self.run.log_artifact(model_artifact, aliases=tags)
else:
model_artifact = self.wandb.Artifact(
name="model-{}".format(self.run.id),
type="model",
metadata=metadata)
model_artifact.add_file(model_path + ".pdparams", name="model")
self.run.log_artifact(model_artifact, aliases=tags)
def on_step_end(self, status):
mode = status['mode']
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if mode == 'train':
training_status = status['training_staus'].get()
for k, v in training_status.items():
training_status[k] = float(v)
metrics = {"train/" + k: v for k, v in training_status.items()}
self.run.log(metrics)
def on_epoch_end(self, status):
mode = status['mode']
epoch_id = status['epoch_id']
save_name = None
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if mode == 'train':
end_epoch = self.model.cfg.epoch
if (
epoch_id + 1
) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
save_name = str(
epoch_id) if epoch_id != end_epoch - 1 else "model_final"
tags = ["latest", "epoch_{}".format(epoch_id)]
self.save_model(
self.model.optimizer,
self.save_dir,
save_name,
epoch_id + 1,
self.model.use_ema,
tags=tags)
if mode == 'eval':
merged_dict = {}
for metric in self.model._metrics:
for key, map_value in metric.get_results().items():
merged_dict["eval/{}-mAP".format(key)] = map_value[0]
merged_dict["epoch"] = status["epoch_id"]
self.run.log(merged_dict)
if 'save_best_model' in status and status['save_best_model']:
for metric in self.model._metrics:
map_res = metric.get_results()
if 'bbox' in map_res:
key = 'bbox'
elif 'keypoint' in map_res:
key = 'keypoint'
else:
key = 'mask'
if key not in map_res:
logger.warning("Evaluation results empty, this may be due to " \
"training iterations being too few or not " \
"loading the correct weights.")
return
if map_res[key][0] >= self.best_ap:
self.best_ap = map_res[key][0]
save_name = 'best_model'
tags = ["best", "epoch_{}".format(epoch_id)]
self.save_model(
self.model.optimizer,
self.save_dir,
save_name,
last_epoch=epoch_id + 1,
ema_model=self.model.use_ema,
ap=self.best_ap,
tags=tags)
def on_train_end(self, status):
self.run.finish()
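# Example (illustrative): the callback is enabled from the config either via
# flat `wandb_*` keys or a `wandb` dict, whose entries are forwarded to
# `wandb.init(**self.wandb_params)`, e.g. in YAML:
#
#   use_wandb: True
#   wandb:
#     project: my_project
#     entity: my_team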
class SniperProposalsGenerator(Callback):
def __init__(self, model):
super(SniperProposalsGenerator, self).__init__(model)

@ -41,22 +41,26 @@ TRT_MIN_SUBGRAPH = {
'HigherHRNet': 3,
'HRNet': 3,
'DeepSORT': 3,
'ByteTrack': 10,
'JDE': 10,
'FairMOT': 5,
'GFL': 16,
'PicoDet': 3,
'CenterNet': 5,
'TOOD': 5,
'YOLOX': 8,
}
KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT']
MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
def _prune_input_spec(input_spec, program, targets):
# try to prune static program to figure out pruned input spec
# so we perform following operations in static mode
device = paddle.get_device()
paddle.enable_static()
paddle.set_device(device)
pruned_input_spec = [{}]
program = program.clone()
program = program._prune(targets=targets)
@ -67,7 +71,7 @@ def _prune_input_spec(input_spec, program, targets):
pruned_input_spec[0][name] = spec
except Exception:
pass
paddle.disable_static()
paddle.disable_static(place=device)
return pruned_input_spec
@ -88,6 +92,7 @@ def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
if key == 'Resize':
if int(image_shape[1]) != -1:
value['target_size'] = image_shape[1:]
value['interp'] = value.get('interp', 1) # cv2.INTER_LINEAR
if fuse_normalize and key == 'NormalizeImage':
continue
p.update(value)
@ -120,12 +125,20 @@ def _dump_infer_config(config, path, image_shape, model):
setup_orderdict()
use_dynamic_shape = True if image_shape[2] == -1 else False
infer_cfg = OrderedDict({
'mode': 'fluid',
'mode': 'paddle',
'draw_threshold': 0.5,
'metric': config['metric'],
'use_dynamic_shape': use_dynamic_shape
})
export_onnx = config.get('export_onnx', False)
export_eb = config.get('export_eb', False)
infer_arch = config['architecture']
if 'RCNN' in infer_arch and export_onnx:
        logger.warning(
            "Exporting RCNN model to ONNX only supports batch_size = 1")
infer_cfg['export_onnx'] = True
infer_cfg['export_eb'] = export_eb
if infer_arch in MOT_ARCH:
if infer_arch == 'DeepSORT':
@ -140,6 +153,12 @@ def _dump_infer_config(config, path, image_shape, model):
infer_cfg['min_subgraph_size'] = min_subgraph_size
arch_state = True
break
if infer_arch == 'YOLOX':
infer_cfg['arch'] = infer_arch
infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
arch_state = True
if not arch_state:
logger.error(
'Architecture: {} is not supported for exporting model now.\n'.
@ -165,12 +184,17 @@ def _dump_infer_config(config, path, image_shape, model):
reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:])
if infer_arch == 'PicoDet':
infer_cfg['NMS'] = config['PicoHead']['nms']
if hasattr(config, 'export') and config['export'].get(
'post_process',
False) and not config['export'].get('benchmark', False):
infer_cfg['arch'] = 'GFL'
head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead'
infer_cfg['NMS'] = config[head_name]['nms']
# In order to speed up the prediction, the threshold of nms
# is adjusted here, which can be changed in infer_cfg.yml
config['PicoHead']['nms']["score_threshold"] = 0.3
config['PicoHead']['nms']["nms_threshold"] = 0.5
infer_cfg['fpn_stride'] = config['PicoHead']['fpn_stride']
config[head_name]['nms']["score_threshold"] = 0.3
config[head_name]['nms']["nms_threshold"] = 0.5
infer_cfg['fpn_stride'] = config[head_name]['fpn_stride']
yaml.dump(infer_cfg, open(path, 'w'))
logger.info("Export inference config file to {}".format(os.path.join(path)))

@ -17,27 +17,33 @@ from __future__ import division
from __future__ import print_function
import os
import cv2
import glob
import re
import paddle
import paddle.nn as nn
import numpy as np
import os.path as osp
from tqdm import tqdm
from collections import defaultdict
from paddlers.models.ppdet.core.workspace import create
from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
from paddlers.models.ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
from paddlers.models.ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results
from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric
from paddlers.models.ppdet.metrics import MCMOTMetric
from paddlers.models.ppdet.modeling.mot.tracker import JDETracker, DeepSORTTracker, OCSORTTracker
from paddlers.models.ppdet.modeling.architectures import YOLOX
from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric
import paddlers.models.ppdet.utils.stats as stats
from .callbacks import Callback, ComposeCallback
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
MOT_ARCH_JDE = ['JDE', 'FairMOT']
MOT_ARCH_SDE = ['DeepSORT', 'ByteTrack']
MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti']
__all__ = ['Tracker']
@ -55,6 +61,12 @@ class Tracker(object):
# build model
self.model = create(cfg.architecture)
if isinstance(self.model.detector, YOLOX):
for k, m in self.model.named_sublayers():
if isinstance(m, nn.BatchNorm2D):
m._epsilon = 1e-3 # for amp(fp16)
m._momentum = 0.97 # 0.03 in pytorch
self.status = {}
self.start_epoch = 0
@ -108,11 +120,15 @@ class Tracker(object):
load_weight(self.model, weights, self.optimizer)
def load_weights_sde(self, det_weights, reid_weights):
if self.model.detector:
with_detector = self.model.detector is not None
with_reid = self.model.reid is not None
if with_detector:
load_weight(self.model.detector, det_weights)
if with_reid:
load_weight(self.model.reid, reid_weights)
else:
load_weight(self.model.reid, reid_weights, self.optimizer)
load_weight(self.model.reid, reid_weights)
def _eval_seq_jde(self,
dataloader,
@ -131,11 +147,8 @@ class Tracker(object):
self.model.eval()
results = defaultdict(list) # support single class and multi classes
for step_id, data in enumerate(dataloader):
for step_id, data in enumerate(tqdm(dataloader)):
self.status['step_id'] = step_id
if frame_id % 40 == 0:
logger.info('Processing frame {} ({:.2f} fps)'.format(
frame_id, 1. / max(1e-5, timer.average_time)))
# forward
timer.tic()
pred_dets, pred_embs = self.model(data)
@ -184,24 +197,23 @@ class Tracker(object):
if save_dir:
if not os.path.exists(save_dir): os.makedirs(save_dir)
use_detector = False if not self.model.detector else True
use_reid = False if not self.model.reid else True
timer = MOTTimer()
results = defaultdict(list)
frame_id = 0
self.status['mode'] = 'track'
self.model.eval()
if use_reid:
self.model.reid.eval()
if not use_detector:
dets_list = load_det_results(det_file, len(dataloader))
logger.info('Finish loading detection results file {}.'.format(
det_file))
for step_id, data in enumerate(dataloader):
tracker = self.model.tracker
for step_id, data in enumerate(tqdm(dataloader)):
self.status['step_id'] = step_id
if frame_id % 40 == 0:
logger.info('Processing frame {} ({:.2f} fps)'.format(
frame_id, 1. / max(1e-5, timer.average_time)))
ori_image = data['ori_image'] # [bs, H, W, 3]
ori_image_shape = data['ori_image'].shape[1:3]
# ori_image_shape: [H, W]
@ -240,7 +252,7 @@ class Tracker(object):
outs['bbox'] = outs['bbox'].numpy()
outs['bbox_num'] = outs['bbox_num'].numpy()
if outs['bbox_num'] > 0 and empty_detections == False:
if len(outs['bbox']) > 0 and empty_detections == False:
# detector outputs: pred_cls_ids, pred_scores, pred_bboxes
pred_cls_ids = outs['bbox'][:, 0:1]
pred_scores = outs['bbox'][:, 1:2]
@ -256,6 +268,8 @@ class Tracker(object):
scale_factor)
else:
pred_bboxes = outs['bbox'][:, 2:]
pred_dets_old = np.concatenate(
(pred_cls_ids, pred_scores, pred_bboxes), axis=1)
else:
logger.warning(
'Frame {} has not detected object, try to modify score threshold.'.
@ -281,16 +295,12 @@ class Tracker(object):
# thus will not inference reid model
continue
pred_scores = pred_scores[keep_idx[0]]
pred_cls_ids = pred_cls_ids[keep_idx[0]]
pred_tlwhs = np.concatenate(
(pred_xyxys[:, 0:2],
pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1),
axis=1)
pred_scores = pred_scores[keep_idx[0]]
pred_dets = np.concatenate(
(pred_tlwhs, pred_scores, pred_cls_ids), axis=1)
(pred_cls_ids, pred_scores, pred_xyxys), axis=1)
tracker = self.model.tracker
if use_reid:
crops = get_crops(
pred_xyxys,
ori_image,
@ -299,12 +309,14 @@ class Tracker(object):
crops = paddle.to_tensor(crops)
data.update({'crops': crops})
pred_embs = self.model(data).numpy()
pred_embs = self.model(data)['embeddings'].numpy()
else:
pred_embs = None
if isinstance(tracker, DeepSORTTracker):
online_tlwhs, online_scores, online_ids = [], [], []
tracker.predict()
online_targets = tracker.update(pred_dets, pred_embs)
online_tlwhs, online_scores, online_ids = [], [], []
for t in online_targets:
if not t.is_confirmed() or t.time_since_update > 1:
continue
@ -327,6 +339,60 @@ class Tracker(object):
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes)
elif isinstance(tracker, JDETracker):
# trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set
tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams(
seq_name, tracker.track_buffer, tracker.conf_thres)
online_targets_dict = tracker.update(pred_dets_old, pred_embs)
online_tlwhs = defaultdict(list)
online_scores = defaultdict(list)
online_ids = defaultdict(list)
for cls_id in range(self.cfg.num_classes):
online_targets = online_targets_dict[cls_id]
for t in online_targets:
tlwh = t.tlwh
tid = t.track_id
tscore = t.score
if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue
if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
3] > tracker.vertical_ratio:
continue
online_tlwhs[cls_id].append(tlwh)
online_ids[cls_id].append(tid)
online_scores[cls_id].append(tscore)
# save results
results[cls_id].append(
(frame_id + 1, online_tlwhs[cls_id],
online_scores[cls_id], online_ids[cls_id]))
timer.toc()
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes)
elif isinstance(tracker, OCSORTTracker):
# OC_SORT Tracker
online_targets = tracker.update(pred_dets_old, pred_embs)
online_tlwhs = []
online_ids = []
online_scores = []
for t in online_targets:
tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]]
tscore = float(t[4])
tid = int(t[5])
if tlwh[2] * tlwh[3] > 0:
online_tlwhs.append(tlwh)
online_ids.append(tid)
online_scores.append(tscore)
timer.toc()
# save results
results[0].append(
(frame_id + 1, online_tlwhs, online_scores, online_ids))
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes)
else:
raise ValueError(tracker)
frame_id += 1
return results, frame_id, timer.average_time, timer.calls
@ -345,10 +411,10 @@ class Tracker(object):
if not os.path.exists(output_dir): os.makedirs(output_dir)
result_root = os.path.join(output_dir, 'mot_results')
if not os.path.exists(result_root): os.makedirs(result_root)
assert data_type in ['mot', 'mcmot', 'kitti'], \
assert data_type in MOT_DATA_TYPE, \
"data_type should be 'mot', 'mcmot' or 'kitti'"
assert model_type in ['JDE', 'DeepSORT', 'FairMOT'], \
"model_type should be 'JDE', 'DeepSORT' or 'FairMOT'"
assert model_type in MOT_ARCH, \
"model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'"
# run tracking
n_frame = 0
@ -371,7 +437,7 @@ class Tracker(object):
save_dir = os.path.join(output_dir, 'mot_outputs',
seq) if save_images or save_videos else None
logger.info('start seq: {}'.format(seq))
logger.info('Evaluate seq: {}'.format(seq))
self.dataset.set_images(self.get_infer_images(infer_dir))
dataloader = create('EvalMOTReader')(self.dataset, 0)
@ -379,13 +445,13 @@ class Tracker(object):
result_filename = os.path.join(result_root, '{}.txt'.format(seq))
with paddle.no_grad():
if model_type in ['JDE', 'FairMOT']:
if model_type in MOT_ARCH_JDE:
results, nf, ta, tc = self._eval_seq_jde(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate)
elif model_type in ['DeepSORT']:
elif model_type in MOT_ARCH_SDE:
results, nf, ta, tc = self._eval_seq_sde(
dataloader,
save_dir=save_dir,
@ -412,7 +478,6 @@ class Tracker(object):
os.system(cmd_str)
logger.info('Save video in {}.'.format(output_video_path))
logger.info('Evaluate seq: {}'.format(seq))
# update metrics
for metric in self._metrics:
metric.update(data_root, seq, data_type, result_root,
@ -471,10 +536,10 @@ class Tracker(object):
if not os.path.exists(output_dir): os.makedirs(output_dir)
result_root = os.path.join(output_dir, 'mot_results')
if not os.path.exists(result_root): os.makedirs(result_root)
assert data_type in ['mot', 'mcmot', 'kitti'], \
assert data_type in MOT_DATA_TYPE, \
"data_type should be 'mot', 'mcmot' or 'kitti'"
assert model_type in ['JDE', 'DeepSORT', 'FairMOT'], \
"model_type should be 'JDE', 'DeepSORT' or 'FairMOT'"
assert model_type in MOT_ARCH, \
"model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'"
# run tracking
if video_file:
@ -504,14 +569,14 @@ class Tracker(object):
frame_rate = self.dataset.frame_rate
with paddle.no_grad():
if model_type in ['JDE', 'FairMOT']:
if model_type in MOT_ARCH_JDE:
results, nf, ta, tc = self._eval_seq_jde(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate,
draw_threshold=draw_threshold)
elif model_type in ['DeepSORT']:
elif model_type in MOT_ARCH_SDE:
results, nf, ta, tc = self._eval_seq_sde(
dataloader,
save_dir=save_dir,
@ -535,3 +600,35 @@ class Tracker(object):
write_mot_results(result_filename, results, data_type,
self.cfg.num_classes)
def get_trick_hyperparams(video_name, ori_buffer, ori_thresh):
if video_name[:3] != 'MOT':
# only used for MOTChallenge (MOT17, MOT20) Test-set
return ori_buffer, ori_thresh
video_name = video_name[:8]
if 'MOT17-05' in video_name:
track_buffer = 14
elif 'MOT17-13' in video_name:
track_buffer = 25
else:
track_buffer = ori_buffer
if 'MOT17-01' in video_name:
track_thresh = 0.65
elif 'MOT17-06' in video_name:
track_thresh = 0.65
elif 'MOT17-12' in video_name:
track_thresh = 0.7
elif 'MOT17-14' in video_name:
track_thresh = 0.67
else:
track_thresh = ori_thresh
    if 'MOT20-06' in video_name or 'MOT20-08' in video_name:
        track_thresh = 0.3
    return track_buffer, track_thresh
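# Example (illustrative): for seq 'MOT17-05-SDP' with defaults (30, 0.6) the
# function yields a shorter track buffer and the original threshold:
#
#   get_trick_hyperparams('MOT17-05-SDP', 30, 0.6)  # -> (14, 0.6)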

@ -20,38 +20,44 @@ import os
import sys
import copy
import time
from tqdm import tqdm
import numpy as np
import typing
from PIL import Image, ImageOps
from PIL import Image, ImageOps, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import paddle
import paddle.nn as nn
import paddle.distributed as dist
from paddle.distributed import fleet
from paddle import amp
from paddle.static import InputSpec
from paddlers.models.ppdet.optimizer import ModelEMA
from paddlers.models.ppdet.core.workspace import create
from paddlers.models.ppdet.modeling.architectures.meta_arch import BaseArch
from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
from paddlers.models.ppdet.utils.visualizer import visualize_results, save_result
from paddlers.models.ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval
from paddlers.models.ppdet.metrics import RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
from paddlers.models.ppdet.data.source.sniper_coco import SniperCOCODataSet
from paddlers.models.ppdet.data.source.category import get_categories
from paddlers.models.ppdet.utils import stats
import paddlers.models.ppdet.utils.stats as stats
from paddlers.models.ppdet.utils.fuse_utils import fuse_conv_bn
from paddlers.models.ppdet.utils import profiler
from paddlers.models.ppdet.modeling.post_process import multiclass_nms
from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator
from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback
from .export_utils import _dump_infer_config, _prune_input_spec
from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')
__all__ = ['Trainer']
MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT']
MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
class Trainer(object):
@ -62,19 +68,30 @@ class Trainer(object):
self.mode = mode.lower()
self.optimizer = None
self.is_loaded_weights = False
self.use_amp = self.cfg.get('amp', False)
self.amp_level = self.cfg.get('amp_level', 'O1')
self.custom_white_list = self.cfg.get('custom_white_list', None)
self.custom_black_list = self.cfg.get('custom_black_list', None)
# build data loader
capital_mode = self.mode.capitalize()
if cfg.architecture in MOT_ARCH and self.mode in ['eval', 'test']:
self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())]
self.dataset = self.cfg['{}MOTDataset'.format(
capital_mode)] = create('{}MOTDataset'.format(capital_mode))()
else:
self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())]
self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
'{}Dataset'.format(capital_mode))()
if cfg.architecture == 'DeepSORT' and self.mode == 'train':
logger.error('DeepSORT has no need of training on mot dataset.')
sys.exit(1)
if cfg.architecture == 'FairMOT' and self.mode == 'eval':
images = self.parse_mot_images(cfg)
self.dataset.set_images(images)
if self.mode == 'train':
self.loader = create('{}Reader'.format(self.mode.capitalize()))(
self.loader = create('{}Reader'.format(capital_mode))(
self.dataset, cfg.worker_num)
if cfg.architecture == 'JDE' and self.mode == 'train':
@ -94,22 +111,32 @@ class Trainer(object):
self.model = self.cfg.model
self.is_loaded_weights = True
if cfg.architecture == 'YOLOX':
for k, m in self.model.named_sublayers():
if isinstance(m, nn.BatchNorm2D):
m._epsilon = 1e-3 # for amp(fp16)
m._momentum = 0.97 # 0.03 in pytorch
        # normalize params for deploy
if 'slim' in cfg and cfg['slim_type'] == 'OFA':
self.model.model.load_meanstd(cfg['TestReader'][
'sample_transforms'])
elif 'slim' in cfg and cfg['slim_type'] == 'Distill':
self.model.student_model.load_meanstd(cfg['TestReader'][
'sample_transforms'])
elif 'slim' in cfg and cfg[
'slim_type'] == 'DistillPrune' and self.mode == 'train':
self.model.student_model.load_meanstd(cfg['TestReader'][
'sample_transforms'])
else:
self.model.load_meanstd(cfg['TestReader']['sample_transforms'])
self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
if self.use_ema:
ema_decay = self.cfg.get('ema_decay', 0.9998)
cycle_epoch = self.cfg.get('cycle_epoch', -1)
self.ema = ModelEMA(
self.model,
decay=ema_decay,
use_thres_step=True,
cycle_epoch=cycle_epoch)
# EvalDataset build with BatchSampler to evaluate in single device
# TODO: multi-device evaluate
if self.mode == 'eval':
if cfg.architecture == 'FairMOT':
self.loader = create('EvalMOTReader')(self.dataset, 0)
else:
self._eval_batch_sampler = paddle.io.BatchSampler(
self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
reader_name = '{}Reader'.format(self.mode.capitalize())
@ -123,12 +150,34 @@ class Trainer(object):
# build optimizer in train mode
if self.mode == 'train':
steps_per_epoch = len(self.loader)
if steps_per_epoch < 1:
logger.warning(
"Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
)
self.lr = create('LearningRate')(steps_per_epoch)
self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
# Unstructured pruner is only enabled in the train mode.
if self.cfg.get('unstructured_prune'):
self.pruner = create('UnstructuredPruner')(self.model,
steps_per_epoch)
if self.use_amp and self.amp_level == 'O2':
self.model, self.optimizer = paddle.amp.decorate(
models=self.model,
optimizers=self.optimizer,
level=self.amp_level)
self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
if self.use_ema:
ema_decay = self.cfg.get('ema_decay', 0.9998)
ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
cycle_epoch = self.cfg.get('cycle_epoch', -1)
ema_black_list = self.cfg.get('ema_black_list', None)
self.ema = ModelEMA(
self.model,
decay=ema_decay,
ema_decay_type=ema_decay_type,
cycle_epoch=cycle_epoch,
ema_black_list=ema_black_list)
self._nranks = dist.get_world_size()
self._local_rank = dist.get_rank()
@ -152,6 +201,8 @@ class Trainer(object):
self._callbacks.append(VisualDLWriter(self))
if self.cfg.get('save_proposals', False):
self._callbacks.append(SniperProposalsGenerator(self))
if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg:
self._callbacks.append(WandbCallback(self))
self._compose_callback = ComposeCallback(self._callbacks)
elif self.mode == 'eval':
self._callbacks = [LogPrinter(self)]
@ -172,7 +223,7 @@ class Trainer(object):
classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False
if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO":
# TODO: bias should be unified
bias = self.cfg['bias'] if 'bias' in self.cfg else 0
bias = 1 if self.cfg.get('bias', False) else 0
output_eval = self.cfg['output_eval'] \
if 'output_eval' in self.cfg else None
save_prediction_only = self.cfg.get('save_prediction_only', False)
@ -184,13 +235,14 @@ class Trainer(object):
# when do validation in train, annotation file should be get from
# EvalReader instead of self.dataset(which is TrainReader)
anno_file = self.dataset.get_anno()
dataset = self.dataset
if self.mode == 'train' and validate:
eval_dataset = self.cfg['EvalDataset']
eval_dataset.check_or_download_dataset()
anno_file = eval_dataset.get_anno()
dataset = eval_dataset
else:
dataset = self.dataset
anno_file = dataset.get_anno()
IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox'
if self.cfg.metric == "COCO":
@ -222,11 +274,7 @@ class Trainer(object):
output_eval = self.cfg['output_eval'] \
if 'output_eval' in self.cfg else None
save_prediction_only = self.cfg.get('save_prediction_only', False)
# pass clsid2catid info to metric instance to avoid multiple loading
# annotation file
clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \
if self.mode == 'eval' else None
imid2path = self.cfg.get('imid2path', None)
# when do validation in train, annotation file should be get from
# EvalReader instead of self.dataset(which is TrainReader)
@ -239,19 +287,25 @@ class Trainer(object):
self._metrics = [
RBoxMetric(
anno_file=anno_file,
clsid2catid=clsid2catid,
classwise=classwise,
output_eval=output_eval,
bias=bias,
save_prediction_only=save_prediction_only)
save_prediction_only=save_prediction_only,
imid2path=imid2path)
]
elif self.cfg.metric == 'VOC':
output_eval = self.cfg['output_eval'] \
if 'output_eval' in self.cfg else None
save_prediction_only = self.cfg.get('save_prediction_only', False)
self._metrics = [
VOCMetric(
label_list=self.dataset.get_label_list(),
class_num=self.cfg.num_classes,
map_type=self.cfg.map_type,
classwise=classwise)
classwise=classwise,
output_eval=output_eval,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == 'WiderFace':
multi_scale = self.cfg.multi_scale_eval if 'multi_scale_eval' in self.cfg else True
@ -334,19 +388,29 @@ class Trainer(object):
self.start_epoch = load_weight(self.model.student_model, weights,
self.optimizer)
else:
self.start_epoch = load_weight(self.model, weights, self.optimizer)
self.start_epoch = load_weight(self.model, weights, self.optimizer,
self.ema if self.use_ema else None)
logger.debug("Resume weights of epoch {}".format(self.start_epoch))
def train(self, validate=False):
assert self.mode == 'train', "Model not in 'train' mode"
Init_mark = False
if validate:
self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(
"EvalDataset")()
sync_bn = (getattr(self.cfg, 'norm_type', None) in [None, 'sync_bn'] and
model = self.model
sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
self.cfg.use_gpu and self._nranks > 1)
if sync_bn:
self.model = BaseArch.convert_sync_batchnorm(self.model)
model = self.model
model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        # enable auto mixed precision mode
if self.use_amp:
scaler = paddle.amp.GradScaler(
enable=self.cfg.use_gpu or self.cfg.use_npu,
init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))
# get distributed model
if self.cfg.get('fleet', False):
model = fleet.distributed_model(model)
self.optimizer = fleet.distributed_optimizer(self.optimizer)
@ -354,12 +418,7 @@ class Trainer(object):
find_unused_parameters = self.cfg[
'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
model = paddle.DataParallel(
self.model, find_unused_parameters=find_unused_parameters)
# initial fp16
if self.cfg.get('fp16', False):
scaler = amp.GradScaler(
enable=self.cfg.use_gpu, init_loss_scaling=1024)
model, find_unused_parameters=find_unused_parameters)
self.status.update({
'epoch_id': self.start_epoch,
@ -381,6 +440,9 @@ class Trainer(object):
self._compose_callback.on_train_begin(self.status)
use_fused_allreduce_gradients = self.cfg[
'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False
for epoch_id in range(self.start_epoch, self.cfg.epoch):
self.status['mode'] = 'train'
self.status['epoch_id'] = epoch_id
@ -395,17 +457,50 @@ class Trainer(object):
self._compose_callback.on_step_begin(self.status)
data['epoch_id'] = epoch_id
if self.cfg.get('fp16', False):
with amp.auto_cast(enable=self.cfg.use_gpu):
if self.use_amp:
if isinstance(
model, paddle.
DataParallel) and use_fused_allreduce_gradients:
with model.no_sync():
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
fused_allreduce_gradients(
list(model.parameters()), None)
else:
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
# in dygraph mode, optimizer.minimize is equal to optimizer.step
scaler.minimize(self.optimizer, scaled_loss)
else:
if isinstance(
model, paddle.
DataParallel) and use_fused_allreduce_gradients:
with model.no_sync():
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
loss.backward()
fused_allreduce_gradients(
list(model.parameters()), None)
else:
# model forward
outputs = model(data)
@ -426,21 +521,23 @@ class Trainer(object):
self.status['batch_time'].update(time.time() - iter_tic)
self._compose_callback.on_step_end(self.status)
if self.use_ema:
self.ema.update(self.model)
self.ema.update()
iter_tic = time.time()
if self.cfg.get('unstructured_prune'):
self.pruner.update_params()
is_snapshot = (self._nranks < 2 or self._local_rank == 0) \
and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
if is_snapshot and self.use_ema:
# apply ema weight on model
if self.use_ema:
weight = copy.deepcopy(self.model.state_dict())
self.model.set_dict(self.ema.apply())
if self.cfg.get('unstructured_prune'):
self.pruner.update_params()
self.status['weight'] = weight
self._compose_callback.on_epoch_end(self.status)
if validate and (self._nranks < 2 or self._local_rank == 0) \
and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
or epoch_id == self.end_epoch - 1):
if validate and is_snapshot:
if not hasattr(self, '_eval_loader'):
# build evaluation dataset and loader
self._eval_dataset = self.cfg.EvalDataset
@ -461,13 +558,15 @@ class Trainer(object):
Init_mark = True
self._init_metrics(validate=validate)
self._reset_metrics()
with paddle.no_grad():
self.status['save_best_model'] = True
self._eval_with_loader(self._eval_loader)
# restore origin weight on model
if self.use_ema:
if is_snapshot and self.use_ema:
# reset original weight
self.model.set_dict(weight)
self.status.pop('weight')
self._compose_callback.on_train_end(self.status)
@ -485,6 +584,14 @@ class Trainer(object):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
# forward
if self.use_amp:
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
outs = self.model(data)
else:
outs = self.model(data)
# update metrics
@ -513,16 +620,248 @@ class Trainer(object):
with paddle.no_grad():
self._eval_with_loader(self.loader)
def _eval_with_loader_slice(self,
loader,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25],
combine_method='nms',
match_threshold=0.6,
match_metric='iou'):
sample_num = 0
tic = time.time()
self._compose_callback.on_epoch_begin(self.status)
self.status['mode'] = 'eval'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
self._flops(flops_loader)
merged_bboxs = []
for step_id, data in enumerate(loader):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
# forward
if self.use_amp:
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
outs = self.model(data)
else:
outs = self.model(data)
shift_amount = data['st_pix']
outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount
outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount
merged_bboxs.append(outs['bbox'])
if data['is_last'] > 0:
# merge matching predictions
merged_results = {'bbox': []}
if combine_method == 'nms':
final_boxes = multiclass_nms(
np.concatenate(merged_bboxs), self.cfg.num_classes,
match_threshold, match_metric)
merged_results['bbox'] = np.concatenate(final_boxes)
elif combine_method == 'concat':
merged_results['bbox'] = np.concatenate(merged_bboxs)
else:
raise ValueError(
"Now only support 'nms' or 'concat' to fuse detection results."
)
merged_results['im_id'] = np.array([[0]])
merged_results['bbox_num'] = np.array(
[len(merged_results['bbox'])])
merged_bboxs = []
data['im_id'] = data['ori_im_id']
# update metrics
for metric in self._metrics:
metric.update(data, merged_results)
# multi-scale inputs: all inputs have same im_id
if isinstance(data, typing.Sequence):
sample_num += data[0]['im_id'].numpy().shape[0]
else:
sample_num += data['im_id'].numpy().shape[0]
self._compose_callback.on_step_end(self.status)
self.status['sample_num'] = sample_num
self.status['cost_time'] = time.time() - tic
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
self._compose_callback.on_epoch_end(self.status)
# reset metric states for metric may performed multiple times
self._reset_metrics()
def evaluate_slice(self,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25],
combine_method='nms',
match_threshold=0.6,
match_metric='iou'):
with paddle.no_grad():
self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio,
combine_method, match_threshold,
match_metric)
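    # Example (illustrative): run sliced evaluation with 640x640 windows and
    # 25% overlap, fusing the per-slice detections with NMS:
    #
    #   trainer = Trainer(cfg, mode='eval')
    #   trainer.load_weights(cfg.weights)
    #   trainer.evaluate_slice(slice_size=[640, 640],
    #                          overlap_ratio=[0.25, 0.25],
    #                          combine_method='nms')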
def slice_predict(self,
images,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25],
combine_method='nms',
match_threshold=0.6,
match_metric='iou',
draw_threshold=0.5,
output_dir='output',
save_results=False,
visualize=True):
self.dataset.set_slice_images(images, slice_size, overlap_ratio)
loader = create('TestReader')(self.dataset, 0)
imid2path = self.dataset.get_imid2path()
anno_file = self.dataset.get_anno()
clsid2catid, catid2name = get_categories(
self.cfg.metric, anno_file=anno_file)
# Run Infer
self.status['mode'] = 'test'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('TestReader')(self.dataset, 0)
self._flops(flops_loader)
results = [] # all images
merged_bboxs = [] # single image
for step_id, data in enumerate(tqdm(loader)):
self.status['step_id'] = step_id
# forward
outs = self.model(data)
outs['bbox'] = outs['bbox'].numpy() # only in test mode
shift_amount = data['st_pix']
outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy()
outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy()
merged_bboxs.append(outs['bbox'])
if data['is_last'] > 0:
# merge matching predictions
merged_results = {'bbox': []}
if combine_method == 'nms':
final_boxes = multiclass_nms(
np.concatenate(merged_bboxs), self.cfg.num_classes,
match_threshold, match_metric)
merged_results['bbox'] = np.concatenate(final_boxes)
elif combine_method == 'concat':
merged_results['bbox'] = np.concatenate(merged_bboxs)
else:
raise ValueError(
"Now only support 'nms' or 'concat' to fuse detection results."
)
merged_results['im_id'] = np.array([[0]])
merged_results['bbox_num'] = np.array(
[len(merged_results['bbox'])])
merged_bboxs = []
data['im_id'] = data['ori_im_id']
for key in ['im_shape', 'scale_factor', 'im_id']:
if isinstance(data, typing.Sequence):
merged_results[key] = data[0][key]
else:
merged_results[key] = data[key]
for key, value in merged_results.items():
if hasattr(value, 'numpy'):
merged_results[key] = value.numpy()
results.append(merged_results)
if visualize:
for outs in results:
batch_res = get_infer_results(outs, clsid2catid)
bbox_num = outs['bbox_num']
start = 0
for i, im_id in enumerate(outs['im_id']):
image_path = imid2path[int(im_id)]
image = Image.open(image_path).convert('RGB')
image = ImageOps.exif_transpose(image)
self.status['original_image'] = np.array(image.copy())
end = start + bbox_num[i]
bbox_res = batch_res['bbox'][start:end] \
if 'bbox' in batch_res else None
mask_res, segm_res, keypoint_res = None, None, None
image = visualize_results(
image, bbox_res, mask_res, segm_res, keypoint_res,
int(im_id), catid2name, draw_threshold)
self.status['result_image'] = np.array(image.copy())
if self._compose_callback:
self._compose_callback.on_step_end(self.status)
# save image with detection
save_name = self._get_save_image_name(output_dir,
image_path)
logger.info("Detection bbox results save in {}".format(
save_name))
image.save(save_name, quality=95)
start = end
def predict(self,
images,
draw_threshold=0.5,
output_dir='output',
save_txt=False):
save_results=False,
visualize=True):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
self.dataset.set_images(images)
loader = create('TestReader')(self.dataset, 0)
imid2path = self.dataset.get_imid2path()
def setup_metrics_for_loader():
            # stash the current metrics and the config entries modified below
metrics = copy.deepcopy(self._metrics)
mode = self.mode
save_prediction_only = self.cfg[
'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
output_eval = self.cfg[
'output_eval'] if 'output_eval' in self.cfg else None
# modify
self.mode = '_test'
self.cfg['save_prediction_only'] = True
self.cfg['output_eval'] = output_dir
self.cfg['imid2path'] = imid2path
self._init_metrics()
# restore
self.mode = mode
self.cfg.pop('save_prediction_only')
if save_prediction_only is not None:
self.cfg['save_prediction_only'] = save_prediction_only
self.cfg.pop('output_eval')
if output_eval is not None:
self.cfg['output_eval'] = output_eval
self.cfg.pop('imid2path')
_metrics = copy.deepcopy(self._metrics)
self._metrics = metrics
return _metrics
if save_results:
metrics = setup_metrics_for_loader()
else:
metrics = []
anno_file = self.dataset.get_anno()
clsid2catid, catid2name = get_categories(
self.cfg.metric, anno_file=anno_file)
@ -534,11 +873,14 @@ class Trainer(object):
flops_loader = create('TestReader')(self.dataset, 0)
self._flops(flops_loader)
results = []
for step_id, data in enumerate(loader):
for step_id, data in enumerate(tqdm(loader)):
self.status['step_id'] = step_id
# forward
outs = self.model(data)
for _m in metrics:
_m.update(data, outs)
for key in ['im_shape', 'scale_factor', 'im_id']:
if isinstance(data, typing.Sequence):
outs[key] = data[0][key]
@ -548,11 +890,17 @@ class Trainer(object):
if hasattr(value, 'numpy'):
outs[key] = value.numpy()
results.append(outs)
# sniper
if type(self.dataset) == SniperCOCODataSet:
results = self.dataset.anno_cropper.aggregate_chips_detections(
results)
for _m in metrics:
_m.accumulate()
_m.reset()
if visualize:
for outs in results:
batch_res = get_infer_results(outs, clsid2catid)
bbox_num = outs['bbox_num']
@ -580,32 +928,26 @@ class Trainer(object):
if self._compose_callback:
self._compose_callback.on_step_end(self.status)
# save image with detection
save_name = self._get_save_image_name(output_dir, image_path)
save_name = self._get_save_image_name(output_dir,
image_path)
logger.info("Detection bbox results save in {}".format(
save_name))
image.save(save_name, quality=95)
if save_txt:
save_path = os.path.splitext(save_name)[0] + '.txt'
results = {}
results["im_id"] = im_id
if bbox_res:
results["bbox_res"] = bbox_res
if keypoint_res:
results["keypoint_res"] = keypoint_res
save_result(save_path, results, catid2name, draw_threshold)
start = end
def _get_save_image_name(self, output_dir, image_path):
"""
Get save image name from source image path.
"""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
image_name = os.path.split(image_path)[-1]
name, ext = os.path.splitext(image_name)
return os.path.join(output_dir, "{}".format(name)) + ext
def _get_infer_cfg_and_input_spec(self, save_dir, prune_input=True):
def _get_infer_cfg_and_input_spec(self,
save_dir,
prune_input=True,
kl_quant=False):
image_shape = None
im_shape = [None, 2]
scale_factor = [None, 2]
@ -628,9 +970,27 @@ class Trainer(object):
if hasattr(self.model, 'deploy'):
self.model.deploy = True
if 'slim' not in self.cfg:
for layer in self.model.sublayers():
if hasattr(layer, 'convert_to_deploy'):
layer.convert_to_deploy()
export_post_process = self.cfg['export'].get(
'post_process', False) if hasattr(self.cfg, 'export') else True
export_nms = self.cfg['export'].get('nms', False) if hasattr(
self.cfg, 'export') else True
export_benchmark = self.cfg['export'].get(
'benchmark', False) if hasattr(self.cfg, 'export') else False
if hasattr(self.model, 'fuse_norm'):
self.model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize',
False)
if hasattr(self.model, 'export_post_process'):
self.model.export_post_process = export_post_process if not export_benchmark else False
if hasattr(self.model, 'export_nms'):
self.model.export_nms = export_nms if not export_benchmark else False
if export_post_process and not export_benchmark:
image_shape = [None] + image_shape[1:]
# Save infer cfg
_dump_infer_config(self.cfg,
@ -663,7 +1023,20 @@ class Trainer(object):
pruned_input_spec = input_spec
# TODO: Hard code, delete it when support prune input_spec.
if self.cfg.architecture == 'PicoDet':
if self.cfg.architecture == 'PicoDet' and not export_post_process:
pruned_input_spec = [{
"image": InputSpec(
shape=image_shape, name='image')
}]
if kl_quant:
if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights:
pruned_input_spec = [{
"image": InputSpec(
shape=image_shape, name='image'),
"scale_factor": InputSpec(
shape=scale_factor, name='scale_factor')
}]
elif 'tinypose' in self.cfg.weights:
pruned_input_spec = [{
"image": InputSpec(
shape=image_shape, name='image')
@ -673,6 +1046,11 @@ class Trainer(object):
def export(self, output_dir='output_inference'):
self.model.eval()
if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
'export'] and self.cfg['export']['fuse_conv_bn']:
self.model = fuse_conv_bn(self.model)
model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
save_dir = os.path.join(output_dir, model_name)
if not os.path.exists(save_dir):
@ -682,7 +1060,7 @@ class Trainer(object):
save_dir)
# dy2st and save model
if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT':
if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']:
paddle.jit.save(
static_model,
os.path.join(save_dir, 'model'),
@ -706,8 +1084,9 @@ class Trainer(object):
break
# TODO: support prune input_spec
kl_quant = True if hasattr(self.cfg.slim, 'ptq') else False
_, pruned_input_spec = self._get_infer_cfg_and_input_spec(
save_dir, prune_input=False)
save_dir, prune_input=False, kl_quant=kl_quant)
self.cfg.slim.save_quantized_model(
self.model,
@ -739,3 +1118,29 @@ class Trainer(object):
flops = flops(self.model, input_spec) / (1000**3)
logger.info(" Model FLOPs : {:.6f}G. (image shape is {})".format(
flops, input_data['image'][0].unsqueeze(0).shape))
def parse_mot_images(self, cfg):
import glob
# for quant
dataset_dir = cfg['EvalMOTDataset'].dataset_dir
data_root = cfg['EvalMOTDataset'].data_root
data_root = '{}/{}'.format(dataset_dir, data_root)
seqs = os.listdir(data_root)
seqs.sort()
all_images = []
for seq in seqs:
infer_dir = os.path.join(data_root, seq)
assert os.path.isdir(infer_dir), \
"{} is not a directory".format(infer_dir)
images = set()
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for ext in exts:
images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
images = list(images)
images.sort()
assert len(images) > 0, "no image found in {}".format(infer_dir)
all_images.extend(images)
logger.info("Found {} inference images in total.".format(
len(images)))
return all_images

@ -0,0 +1,35 @@
# Building Custom Ops
The rotated-box IoU ops are implemented following PaddlePaddle's [custom external operator](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) mechanism.
## 1. Requirements
- Paddle >= 2.0.1
- gcc 8.2
## 2. Installation
```
python setup.py install
```
Once the build finishes, the ops are ready to use. Below is a usage example for `rbox_iou`:
```
# import the custom op
import numpy as np
import paddle
from ext_op import rbox_iou
paddle.set_device('gpu:0')
paddle.disable_static()
rbox1 = np.random.rand(13000, 5)
rbox2 = np.random.rand(7, 5)
pd_rbox1 = paddle.to_tensor(rbox1)
pd_rbox2 = paddle.to_tensor(rbox2)
iou = rbox_iou(pd_rbox1, pd_rbox2)
print('iou', iou)
```
## 3. Unit Tests
Run the unit tests to verify that the custom ops behave correctly, for example:
```
python unittest/test_matched_rbox_iou.py
```
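The companion test for the `rbox_iou` op, added under the same `unittest` directory in this commit, runs the same way:
```
python unittest/test_rbox_iou.py
```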

@ -0,0 +1,90 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
#include "paddle/extension.h"
#include "rbox_iou_op.h"
template <typename T>
void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
const T *rbox2_data_ptr, T *output_data_ptr) {
int i;
for (i = 0; i < rbox_num; i++) {
output_data_ptr[i] =
rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5);
}
}
#define CHECK_INPUT_CPU(x) \
PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
std::vector<paddle::Tensor> MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_CPU(rbox1);
CHECK_INPUT_CPU(rbox2);
PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must have the same first dimension");
auto rbox_num = rbox1.shape()[0];
auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox_num});
PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rotated_iou_cpu_kernel", ([&] {
matched_rbox_iou_cpu_kernel<data_t>(
rbox_num, rbox1.data<data_t>(),
rbox2.data<data_t>(),
output.mutable_data<data_t>());
}));
return {output};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2);
#endif
#define CHECK_INPUT_SAME(x1, x2) \
PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_SAME(rbox1, rbox2);
if (rbox1.place() == paddle::PlaceType::kCPU) {
return MatchedRboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
} else if (rbox1.place() == paddle::PlaceType::kGPU) {
return MatchedRboxIouCUDAForward(rbox1, rbox2);
#endif
}
}
std::vector<std::vector<int64_t>>
MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
std::vector<int64_t> rbox2_shape) {
return {{rbox1_shape[0]}};
}
std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
paddle::DataType t2) {
return {t1};
}
PD_BUILD_OP(matched_rbox_iou)
.Inputs({"RBOX1", "RBOX2"})
.Outputs({"Output"})
.SetKernelFn(PD_KERNEL(MatchedRboxIouForward))
.SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype));

@ -0,0 +1,63 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
#include "paddle/extension.h"
#include "rbox_iou_op.h"
/**
Computes ceil(a / b)
*/
static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; }
template <typename T>
__global__ void
matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
const T *rbox2_data_ptr, T *output_data_ptr) {
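// Grid-stride loop: each thread strides across the matched pairs, covering
// any rbox_num independent of the launch configuration.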
for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
tid += blockDim.x * gridDim.x) {
output_data_ptr[tid] =
rbox_iou_single<T>(rbox1_data_ptr + tid * 5, rbox2_data_ptr + tid * 5);
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_GPU(rbox1);
CHECK_INPUT_GPU(rbox2);
PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must have the same first dimension");
auto rbox_num = rbox1.shape()[0];
auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox_num});
const int thread_per_block = 512;
const int block_per_grid = CeilDiv(rbox_num, thread_per_block);
PD_DISPATCH_FLOATING_TYPES(
rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] {
matched_rbox_iou_cuda_kernel<
data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
output.mutable_data<data_t>());
}));
return {output};
}

@ -0,0 +1,97 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
#include "rbox_iou_op.h"
#include "paddle/extension.h"
template <typename T>
void rbox_iou_cpu_kernel(
const int rbox1_num,
const int rbox2_num,
const T* rbox1_data_ptr,
const T* rbox2_data_ptr,
T* output_data_ptr) {
int i, j;
for (i = 0; i < rbox1_num; i++) {
for (j = 0; j < rbox2_num; j++) {
int offset = i * rbox2_num + j;
output_data_ptr[offset] = rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
}
}
}
#define CHECK_INPUT_CPU(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) {
CHECK_INPUT_CPU(rbox1);
CHECK_INPUT_CPU(rbox2);
auto rbox1_num = rbox1.shape()[0];
auto rbox2_num = rbox2.shape()[0];
auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox1_num, rbox2_num});
PD_DISPATCH_FLOATING_TYPES(
rbox1.type(),
"rbox_iou_cpu_kernel",
([&] {
rbox_iou_cpu_kernel<data_t>(
rbox1_num,
rbox2_num,
rbox1.data<data_t>(),
rbox2.data<data_t>(),
output.mutable_data<data_t>());
}));
return {output};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2);
#endif
#define CHECK_INPUT_SAME(x1, x2) PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) {
CHECK_INPUT_SAME(rbox1, rbox2);
if (rbox1.place() == paddle::PlaceType::kCPU) {
return RboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
} else if (rbox1.place() == paddle::PlaceType::kGPU) {
return RboxIouCUDAForward(rbox1, rbox2);
#endif
}
}
std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> rbox1_shape, std::vector<int64_t> rbox2_shape) {
return {{rbox1_shape[0], rbox2_shape[0]}};
}
std::vector<paddle::DataType> InferDtype(paddle::DataType t1, paddle::DataType t2) {
return {t1};
}
PD_BUILD_OP(rbox_iou)
.Inputs({"RBOX1", "RBOX2"})
.Outputs({"Output"})
.SetKernelFn(PD_KERNEL(RboxIouForward))
.SetInferShapeFn(PD_INFER_SHAPE(InferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(InferDtype));

@ -0,0 +1,114 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
#include "paddle/extension.h"
#include "rbox_iou_op.h"
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
/**
Computes ceil(a / b)
*/
static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; }
template <typename T>
__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
const T *rbox1_data_ptr,
const T *rbox2_data_ptr,
T *output_data_ptr) {
// get row_start and col_start
const int rbox1_block_idx = blockIdx.x * blockDim.x;
const int rbox2_block_idx = blockIdx.y * blockDim.y;
const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x);
const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y);
__shared__ T block_boxes1[BLOCK_DIM_X * 5];
__shared__ T block_boxes2[BLOCK_DIM_Y * 5];
// It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) {
block_boxes1[threadIdx.x * 5 + 0] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0];
block_boxes1[threadIdx.x * 5 + 1] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1];
block_boxes1[threadIdx.x * 5 + 2] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2];
block_boxes1[threadIdx.x * 5 + 3] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3];
block_boxes1[threadIdx.x * 5 + 4] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4];
}
// threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as
// above: threadIdx.y == 0
if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) {
block_boxes2[threadIdx.x * 5 + 0] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0];
block_boxes2[threadIdx.x * 5 + 1] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1];
block_boxes2[threadIdx.x * 5 + 2] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2];
block_boxes2[threadIdx.x * 5 + 3] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3];
block_boxes2[threadIdx.x * 5 + 4] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4];
}
// sync
__syncthreads();
if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) {
int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx +
threadIdx.y;
output_data_ptr[offset] = rbox_iou_single<T>(
block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_GPU(rbox1);
CHECK_INPUT_GPU(rbox2);
auto rbox1_num = rbox1.shape()[0];
auto rbox2_num = rbox2.shape()[0];
auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox1_num, rbox2_num});
const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);
const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);
dim3 blocks(blocks_x, blocks_y);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
PD_DISPATCH_FLOATING_TYPES(
rbox1.type(), "rbox_iou_cuda_kernel", ([&] {
rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(
rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
output.mutable_data<data_t>());
}));
return {output};
}

@ -0,0 +1,348 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
#pragma once
#include <cassert>
#include <cmath>
#include <vector>
#ifdef __CUDACC__
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif
namespace {
template <typename T> struct RotatedBox { T x_ctr, y_ctr, w, h, a; };
template <typename T> struct Point {
T x, y;
HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {}
HOST_DEVICE_INLINE Point operator+(const Point &p) const {
return Point(x + p.x, y + p.y);
}
HOST_DEVICE_INLINE Point &operator+=(const Point &p) {
x += p.x;
y += p.y;
return *this;
}
HOST_DEVICE_INLINE Point operator-(const Point &p) const {
return Point(x - p.x, y - p.y);
}
HOST_DEVICE_INLINE Point operator*(const T coeff) const {
return Point(x * coeff, y * coeff);
}
};
template <typename T>
HOST_DEVICE_INLINE T dot_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.x + A.y * B.y;
}
template <typename T>
HOST_DEVICE_INLINE T cross_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.y - B.x * A.y;
}
template <typename T>
HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T> &box,
Point<T> (&pts)[4]) {
// M_PI / 180. == 0.01745329251
// double theta = box.a * 0.01745329251;
// MODIFIED
double theta = box.a;
T cosTheta2 = (T)cos(theta) * 0.5f;
T sinTheta2 = (T)sin(theta) * 0.5f;
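// Each corner is the box center plus the rotated half-extents (+-w/2, +-h/2);
// note the image y-axis points down. pts[2]/pts[3] are pts[0]/pts[1]
// reflected through the center.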
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
template <typename T>
HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4],
Point<T> (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point<T> vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
T det = cross_2d<T>(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
T t1 = cross_2d<T>(vec2[j], vec12) / det;
T t2 = cross_2d<T>(vec1[i], vec12) / det;
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto &AB = vec2[0];
const auto &DA = vec2[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto &AB = vec1[0];
const auto &DA = vec1[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
template <typename T>
HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
const int &num_in, Point<T> (&q)[24],
bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto &start = p[t]; // starting point
// Step 2:
// Subtract starting point from every points (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort point 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T dist[24];
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#ifdef __CUDACC__
// CUDA version
// In the future, we can potentially use thrust
// for sorting here to improve speed (though not guaranteed)
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) ||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
#else
// CPU version
std::sort(q + 1, q + num_in,
[](const Point<T> &A, const Point<T> &B) -> bool {
T temp = cross_2d<T>(A, B);
if (fabs(temp) < 1e-6) {
return dot_2d<T>(A, A) < dot_2d<T>(B, B);
} else {
return temp > 0;
}
});
#endif
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
m--;
}
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int &m) {
if (m <= 2) {
return 0;
}
T area = 0;
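// Triangle-fan shoelace formula: area = 1/2 * sum_i |cross(q[i]-q[0], q[i+1]-q[0])|;
// valid because the hull vertices are already in rotational order.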
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T>
HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox<T> &box1,
const RotatedBox<T> &box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
Point<T> pts1[4];
Point<T> pts2[4];
get_rotated_vertices<T>(box1, pts1);
get_rotated_vertices<T>(box2, pts2);
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
} // namespace
template <typename T>
HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw,
T const *const box2_raw) {
// shift center to the middle point to achieve higher precision in result
RotatedBox<T> box1, box2;
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
box1.x_ctr = box1_raw[0] - center_shift_x;
box1.y_ctr = box1_raw[1] - center_shift_y;
box1.w = box1_raw[2];
box1.h = box1_raw[3];
box1.a = box1_raw[4];
box2.x_ctr = box2_raw[0] - center_shift_x;
box2.y_ctr = box2_raw[1] - center_shift_y;
box2.w = box2_raw[2];
box2.h = box2_raw[3];
box2.a = box2_raw[4];
const T area1 = box1.w * box1.h;
const T area2 = box2.w * box2.h;
if (area1 < 1e-14 || area2 < 1e-14) {
return 0.f;
}
const T intersection = rboxes_intersection<T>(box1, box2);
const T iou = intersection / (area1 + area2 - intersection);
return iou;
}

@ -0,0 +1,33 @@
import os
import glob
import paddle
from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup
def get_extensions():
root_dir = os.path.dirname(os.path.abspath(__file__))
ext_root_dir = os.path.join(root_dir, 'csrc')
sources = []
for ext_name in os.listdir(ext_root_dir):
ext_dir = os.path.join(ext_root_dir, ext_name)
source = glob.glob(os.path.join(ext_dir, '*.cc'))
kwargs = dict()
if paddle.device.is_compiled_with_cuda():
source += glob.glob(os.path.join(ext_dir, '*.cu'))
if not source:
continue
sources += source
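# All op sources are compiled into a single extension; -DPADDLE_WITH_CUDA
# enables the GPU dispatch branches guarded by #ifdef in the .cc files.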
if paddle.device.is_compiled_with_cuda():
extension = CUDAExtension(
sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']})
else:
extension = CppExtension(sources)
return extension
if __name__ == "__main__":
setup(name='ext_op', ext_modules=get_extensions())

@ -0,0 +1,149 @@
import numpy as np
import sys
import time
from shapely.geometry import Polygon
import paddle
import unittest
from ext_op import matched_rbox_iou
def rbox2poly_single(rrect, get_best_begin_point=False):
"""
rrect:[x_ctr,y_ctr,w,h,angle]
to
poly:[x0,y0,x1,y1,x2,y2,x3,y3]
"""
x_ctr, y_ctr, width, height, angle = rrect[:5]
tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
# rect 2x4
rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
R = np.array([[np.cos(angle), -np.sin(angle)],
[np.sin(angle), np.cos(angle)]])
# poly
poly = R.dot(rect)
x0, x1, x2, x3 = poly[0, :4] + x_ctr
y0, y1, y2, y3 = poly[1, :4] + y_ctr
poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64)
return poly
def intersection(g, p):
"""
Intersection.
"""
g = g[:8].reshape((4, 2))
p = p[:8].reshape((4, 2))
a = g
b = p
use_filter = True
if use_filter:
# step1:
inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0]))
inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0]))
inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1]))
inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1]))
if inter_x1 >= inter_x2 or inter_y1 >= inter_y2:
return 0.
x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0]))
x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0]))
y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1]))
y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1]))
if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2:
return 0.
g = Polygon(g)
p = Polygon(p)
if not g.is_valid or not p.is_valid:
return 0
inter = Polygon(g).intersection(Polygon(p)).area
union = g.area + p.area - inter
if union == 0:
return 0
else:
return inter / union
def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False):
"""
Args:
anchors: [M, 5] x_ctr, y_ctr, w, h, angle
gt_bboxes: [M, 5] x_ctr, y_ctr, w, h, angle
Returns:
matched_iou: [M]
"""
assert anchors.shape[1] == 5
assert gt_bboxes.shape[1] == 5
gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]
anchors_ploy = [rbox2poly_single(e) for e in anchors]
num = len(anchors_ploy)
iou = np.zeros((num, ), dtype=np.float64)
start_time = time.time()
for i in range(num):
try:
iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i])
except Exception as e:
print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i], 'anchors_ploy[i]',
anchors_ploy[i], e)
return iou
def gen_sample(n):
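# Sample x_ctr, y_ctr, w, h in (0.001, 0.451) and angle in (-0.5, 0.5) rad
# so the rotated rectangles stay well-formed.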
rbox = np.random.rand(n, 5)
rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001
rbox[:, 4] = rbox[:, 4] - 0.5
return rbox
class MatchedRBoxIoUTest(unittest.TestCase):
def setUp(self):
self.initTestCase()
self.rbox1 = gen_sample(self.n)
self.rbox2 = gen_sample(self.n)
def initTestCase(self):
self.n = 1000
def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):
self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)
def get_places(self):
places = [paddle.CPUPlace()]
if paddle.device.is_compiled_with_cuda():
places.append(paddle.CUDAPlace(0))
return places
def check_output(self, place):
paddle.disable_static()
pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)
pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)
actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy()
# copy before scaling so the original boxes are kept for the next place
poly_rbox1 = self.rbox1.copy()
poly_rbox2 = self.rbox2.copy()
poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024
poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024
expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)
self.assertAllClose(
actual_t,
expect_t,
msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format(
str(place), str(expect_t), str(actual_t)))
def test_output(self):
places = self.get_places()
for place in places:
self.check_output(place)
if __name__ == "__main__":
unittest.main()

@ -0,0 +1,151 @@
import numpy as np
import sys
import time
from shapely.geometry import Polygon
import paddle
import unittest
from ext_op import rbox_iou
def rbox2poly_single(rrect, get_best_begin_point=False):
"""
rrect:[x_ctr,y_ctr,w,h,angle]
to
poly:[x0,y0,x1,y1,x2,y2,x3,y3]
"""
x_ctr, y_ctr, width, height, angle = rrect[:5]
tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
# rect 2x4
rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
R = np.array([[np.cos(angle), -np.sin(angle)],
[np.sin(angle), np.cos(angle)]])
# poly
poly = R.dot(rect)
x0, x1, x2, x3 = poly[0, :4] + x_ctr
y0, y1, y2, y3 = poly[1, :4] + y_ctr
poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64)
return poly
def intersection(g, p):
"""
Intersection.
"""
g = g[:8].reshape((4, 2))
p = p[:8].reshape((4, 2))
a = g
b = p
use_filter = True
if use_filter:
# step1:
inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0]))
inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0]))
inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1]))
inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1]))
if inter_x1 >= inter_x2 or inter_y1 >= inter_y2:
return 0.
x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0]))
x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0]))
y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1]))
y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1]))
if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2:
return 0.
g = Polygon(g)
p = Polygon(p)
if not g.is_valid or not p.is_valid:
return 0
inter = Polygon(g).intersection(Polygon(p)).area
union = g.area + p.area - inter
if union == 0:
return 0
else:
return inter / union
def rbox_overlaps(anchors, gt_bboxes, use_cv2=False):
"""
Args:
anchors: [NA, 5] x_ctr, y_ctr, w, h, angle
gt_bboxes: [M, 5] x_ctr, y_ctr, w, h, angle
Returns:
iou: [NA, M]
"""
assert anchors.shape[1] == 5
assert gt_bboxes.shape[1] == 5
gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]
anchors_ploy = [rbox2poly_single(e) for e in anchors]
num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy)
iou = np.zeros((num_anchors, num_gt), dtype=np.float64)
start_time = time.time()
for i in range(num_anchors):
for j in range(num_gt):
try:
iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j])
except Exception as e:
print('cur anchors_ploy[i]', anchors_ploy[i],
'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e)
return iou
def gen_sample(n):
rbox = np.random.rand(n, 5)
rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001
rbox[:, 4] = rbox[:, 4] - 0.5
return rbox
class RBoxIoUTest(unittest.TestCase):
def setUp(self):
self.initTestCase()
self.rbox1 = gen_sample(self.n)
self.rbox2 = gen_sample(self.m)
def initTestCase(self):
self.n = 13000
self.m = 7
def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):
self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)
def get_places(self):
places = [paddle.CPUPlace()]
if paddle.device.is_compiled_with_cuda():
places.append(paddle.CUDAPlace(0))
return places
def check_output(self, place):
paddle.disable_static()
pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)
pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)
actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy()
# copy before scaling so the original boxes are kept for the next place
poly_rbox1 = self.rbox1.copy()
poly_rbox2 = self.rbox2.copy()
poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024
poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024
expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)
self.assertAllClose(
actual_t,
expect_t,
msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format(
str(place), str(expect_t), str(actual_t)))
def test_output(self):
places = self.get_places()
for place in places:
self.check_output(place)
if __name__ == "__main__":
unittest.main()

@ -65,6 +65,14 @@ def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
return det_res
def strip_mask(mask):
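# Batched masks are padded to a common size with -1; counting the
# non-padding entries of the first row/column recovers the true (im_h, im_w).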
row = mask[0, 0, :]
col = mask[0, :, 0]
im_h = len(col) - np.count_nonzero(col == -1)
im_w = len(row) - np.count_nonzero(row == -1)
return mask[:, :im_h, :im_w]
def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):
import pycocotools.mask as mask_util
seg_res = []
@ -72,8 +80,10 @@ def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):
for i in range(len(mask_nums)):
cur_image_id = int(image_id[i][0])
det_nums = mask_nums[i]
mask_i = masks[k:k + det_nums]
mask_i = strip_mask(mask_i)
for j in range(det_nums):
mask = masks[k].astype(np.uint8)
mask = mask_i[j].astype(np.uint8)
score = float(bboxes[k][1])
label = int(bboxes[k][0])
k = k + 1

@ -16,6 +16,7 @@ import os
import json
from collections import defaultdict, OrderedDict
import numpy as np
import paddle
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from ..modeling.keypoint_utils import oks_nms
@ -70,15 +71,23 @@ class KeyPointTopDownCOCOEval(object):
self.results['all_preds'][self.idx:self.idx + num_images, :, 0:
3] = kpts[:, :, 0:3]
self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[
'center'].numpy()[:, 0:2]
'center'].numpy()[:, 0:2] if isinstance(
inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2]
self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[
'scale'].numpy()[:, 0:2]
'scale'].numpy()[:, 0:2] if isinstance(
inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2]
self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod(
inputs['scale'].numpy() * 200, 1)
self.results['all_boxes'][self.idx:self.idx + num_images,
5] = np.squeeze(inputs['score'].numpy())
inputs['scale'].numpy() * 200,
1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod(
inputs['scale'] * 200, 1)
self.results['all_boxes'][
self.idx:self.idx + num_images,
5] = np.squeeze(inputs['score'].numpy()) if isinstance(
inputs['score'], paddle.Tensor) else np.squeeze(inputs['score'])
if isinstance(inputs['im_id'], paddle.Tensor):
self.results['image_path'].extend(inputs['im_id'].numpy())
else:
self.results['image_path'].extend(inputs['im_id'])
self.idx += num_images
def _write_coco_keypoint_results(self, keypoints):

@ -22,7 +22,7 @@ import sys
import numpy as np
import itertools
import paddle
from paddlers.models.ppdet.modeling.bbox_utils import poly2rbox, rbox2poly_np
from paddlers.models.ppdet.modeling.rbox_utils import poly2rbox_np
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@ -91,15 +91,13 @@ def jaccard_overlap(pred, gt, is_bbox_normalized=False):
return overlap
def calc_rbox_iou(pred, gt_rbox):
def calc_rbox_iou(pred, gt_poly):
"""
calc iou between rotated bbox
"""
# calc iou of bounding box for speedup
pred = np.array(pred, np.float32).reshape(-1, 8)
pred = pred.reshape(-1, 2)
gt_poly = rbox2poly_np(np.array(gt_rbox).reshape(-1, 5))[0]
gt_poly = gt_poly.reshape(-1, 2)
pred = np.array(pred, np.float32).reshape(-1, 2)
gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2)
pred_rect = [
np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]),
np.max(pred[:, 1])
@ -114,20 +112,15 @@ def calc_rbox_iou(pred, gt_rbox):
return iou
# calc rbox iou
pred = pred.reshape(-1, 8)
pred = np.array(pred, np.float32).reshape(-1, 8)
pred_rbox = poly2rbox(pred)
pred_rbox = pred_rbox.reshape(-1, 5)
pred_rbox = pred_rbox.reshape(-1, 5)
pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5)
gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5)
try:
from rbox_iou_ops import rbox_iou
from ext_op import rbox_iou
except Exception as e:
print("import custom_ops error, try install rbox_iou_ops " \
print("import custom_ops error, try install ext_op " \
"following ppdet/ext_op/README.md", e)
sys.stdout.flush()
sys.exit(-1)
gt_rbox = np.array(gt_rbox, np.float32).reshape(-1, 5)
pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32')
pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32')
iou = rbox_iou(pd_gt_rbox, pd_pred_rbox)
@ -138,8 +131,7 @@ def calc_rbox_iou(pred, gt_rbox):
def prune_zero_padding(gt_box, gt_label, difficult=None):
valid_cnt = 0
for i in range(len(gt_box)):
if gt_box[i, 0] == 0 and gt_box[i, 1] == 0 and \
gt_box[i, 2] == 0 and gt_box[i, 3] == 0:
if (gt_box[i] == 0).all():
break
valid_cnt += 1
return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt]
@ -212,7 +204,7 @@ class DetectionMAP(object):
max_overlap = -1.0
for i, gl in enumerate(gt_label):
if int(gl) == int(l):
if len(gt_box[i]) == 5:
if len(gt_box[i]) == 8:
overlap = calc_rbox_iou(pred, gt_box[i])
else:
overlap = jaccard_overlap(pred, gt_box[i],

@ -21,18 +21,21 @@ import copy
import sys
import math
from collections import defaultdict
from motmetrics.math_util import quiet_divide
import numpy as np
import pandas as pd
import paddle
import paddle.nn.functional as F
from .metrics import Metric
try:
import motmetrics as mm
import openpyxl
from motmetrics.math_util import quiet_divide
metrics = mm.metrics.motchallenge_metrics
mh = mm.metrics.create()
except:
print(
'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
)
pass
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@ -302,24 +305,30 @@ class MCMOTEvaluator(object):
self.num_classes = num_classes
self.load_annotations()
try:
import motmetrics as mm
mm.lap.default_solver = 'lap'
except Exception as e:
raise RuntimeError(
'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
)
self.reset_accumulator()
self.class_accs = []
def load_annotations(self):
assert self.data_type == 'mcmot'
self.gt_filename = os.path.join(self.data_root, '../', '../',
'sequences',
self.gt_filename = os.path.join(self.data_root, '../', 'sequences',
'{}.txt'.format(self.seq_name))
if not os.path.exists(self.gt_filename):
logger.warning(
"gt_filename '{}' of MCMOTEvaluator is not exist, so the MOTA will be -INF."
)
def reset_accumulator(self):
import motmetrics as mm
mm.lap.default_solver = 'lap'
self.acc = mm.MOTAccumulator(auto_id=True)
def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False):
import motmetrics as mm
mm.lap.default_solver = 'lap'
if union:
trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3]
gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3]
@ -393,9 +402,6 @@ class MCMOTEvaluator(object):
names,
metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
'precision', 'recall')):
import motmetrics as mm
mm.lap.default_solver = 'lap'
names = copy.deepcopy(names)
if metrics is None:
metrics = mm.metrics.motchallenge_metrics

@ -22,11 +22,14 @@ import json
import paddle
import numpy as np
import typing
from collections import defaultdict
from pathlib import Path
from .map_utils import prune_zero_padding, DetectionMAP
from .coco_utils import get_infer_results, cocoapi_eval
from .widerface_utils import face_eval_run
from paddlers.models.ppdet.data.source.category import get_categories
from paddlers.models.ppdet.modeling.rbox_utils import poly2rbox_np
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@ -69,8 +72,6 @@ class Metric(paddle.metric.Metric):
class COCOMetric(Metric):
def __init__(self, anno_file, **kwargs):
assert os.path.isfile(anno_file), \
"anno_file {} not a file".format(anno_file)
self.anno_file = anno_file
self.clsid2catid = kwargs.get('clsid2catid', None)
if self.clsid2catid is None:
@ -81,6 +82,14 @@ class COCOMetric(Metric):
self.bias = kwargs.get('bias', 0)
self.save_prediction_only = kwargs.get('save_prediction_only', False)
self.iou_type = kwargs.get('IouType', 'bbox')
if not self.save_prediction_only:
assert os.path.isfile(anno_file), \
"anno_file {} not a file".format(anno_file)
if self.output_eval is not None:
Path(self.output_eval).mkdir(exist_ok=True)
self.reset()
def reset(self):
@ -218,7 +227,9 @@ class VOCMetric(Metric):
map_type='11point',
is_bbox_normalized=False,
evaluate_difficult=False,
classwise=False):
classwise=False,
output_eval=None,
save_prediction_only=False):
assert os.path.isfile(label_list), \
"label_list {} not a file".format(label_list)
self.clsid2catid, self.catid2name = get_categories('VOC', label_list)
@ -226,6 +237,8 @@ class VOCMetric(Metric):
self.overlap_thresh = overlap_thresh
self.map_type = map_type
self.evaluate_difficult = evaluate_difficult
self.output_eval = output_eval
self.save_prediction_only = save_prediction_only
self.detection_map = DetectionMAP(
class_num=class_num,
overlap_thresh=overlap_thresh,
@ -238,34 +251,52 @@ class VOCMetric(Metric):
self.reset()
def reset(self):
self.results = {'bbox': [], 'score': [], 'label': []}
self.detection_map.reset()
def update(self, inputs, outputs):
bbox_np = outputs['bbox'].numpy()
bbox_np = outputs['bbox'].numpy() if isinstance(
outputs['bbox'], paddle.Tensor) else outputs['bbox']
bboxes = bbox_np[:, 2:]
scores = bbox_np[:, 1]
labels = bbox_np[:, 0]
bbox_lengths = outputs['bbox_num'].numpy()
bbox_lengths = outputs['bbox_num'].numpy() if isinstance(
outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num']
self.results['bbox'].append(bboxes.tolist())
self.results['score'].append(scores.tolist())
self.results['label'].append(labels.tolist())
if bboxes is None or bboxes.shape == (1, 1):
return
if self.save_prediction_only:
return
gt_boxes = inputs['gt_bbox']
gt_labels = inputs['gt_class']
difficults = inputs['difficult'] if not self.evaluate_difficult \
else None
scale_factor = inputs['scale_factor'].numpy(
) if 'scale_factor' in inputs else np.ones(
(gt_boxes.shape[0], 2)).astype('float32')
if 'scale_factor' in inputs:
scale_factor = inputs['scale_factor'].numpy() if isinstance(
inputs['scale_factor'],
paddle.Tensor) else inputs['scale_factor']
else:
scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
bbox_idx = 0
for i in range(len(gt_boxes)):
gt_box = gt_boxes[i].numpy()
gt_box = gt_boxes[i].numpy() if isinstance(
gt_boxes[i], paddle.Tensor) else gt_boxes[i]
h, w = scale_factor[i]
gt_box = gt_box / np.array([w, h, w, h])
gt_label = gt_labels[i].numpy()
difficult = None if difficults is None \
else difficults[i].numpy()
gt_label = gt_labels[i].numpy() if isinstance(
gt_labels[i], paddle.Tensor) else gt_labels[i]
if difficults is not None:
difficult = difficults[i].numpy() if isinstance(
difficults[i], paddle.Tensor) else difficults[i]
else:
difficult = None
bbox_num = bbox_lengths[i]
bbox = bboxes[bbox_idx:bbox_idx + bbox_num]
score = scores[bbox_idx:bbox_idx + bbox_num]
@ -277,6 +308,15 @@ class VOCMetric(Metric):
bbox_idx += bbox_num
def accumulate(self):
output = "bbox.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results, f)
logger.info('The bbox result is saved to {}.'.format(output))
if self.save_prediction_only:
return
logger.info("Accumulating evaluatation results...")
self.detection_map.accumulate()
@ -309,25 +349,16 @@ class WiderFaceMetric(Metric):
class RBoxMetric(Metric):
def __init__(self, anno_file, **kwargs):
assert os.path.isfile(anno_file), \
"anno_file {} not a file".format(anno_file)
assert os.path.exists(anno_file), "anno_file {} does not exist".format(
anno_file)
self.anno_file = anno_file
self.gt_anno = json.load(open(self.anno_file))
cats = self.gt_anno['categories']
self.clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
self.catid2clsid = {cat['id']: i for i, cat in enumerate(cats)}
self.catid2name = {cat['id']: cat['name'] for cat in cats}
self.clsid2catid, self.catid2name = get_categories('COCO', anno_file)
self.catid2clsid = {v: k for k, v in self.clsid2catid.items()}
self.classwise = kwargs.get('classwise', False)
self.output_eval = kwargs.get('output_eval', None)
# TODO: bias should be unified
self.bias = kwargs.get('bias', 0)
self.save_prediction_only = kwargs.get('save_prediction_only', False)
self.iou_type = kwargs.get('IouType', 'bbox')
self.overlap_thresh = kwargs.get('overlap_thresh', 0.5)
self.map_type = kwargs.get('map_type', '11point')
self.evaluate_difficult = kwargs.get('evaluate_difficult', False)
self.imid2path = kwargs.get('imid2path', None)
class_num = len(self.catid2name)
self.detection_map = DetectionMAP(
class_num=class_num,
@ -341,7 +372,7 @@ class RBoxMetric(Metric):
self.reset()
def reset(self):
self.result_bbox = []
self.results = []
self.detection_map.reset()
def update(self, inputs, outputs):
@ -351,41 +382,81 @@ class RBoxMetric(Metric):
outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
im_id = inputs['im_id']
outs['im_id'] = im_id.numpy() if isinstance(im_id,
paddle.Tensor) else im_id
im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id
outs['im_id'] = im_id
infer_results = get_infer_results(
outs, self.clsid2catid, bias=self.bias)
self.result_bbox += infer_results[
'bbox'] if 'bbox' in infer_results else []
bbox = [b['bbox'] for b in self.result_bbox]
score = [b['score'] for b in self.result_bbox]
label = [b['category_id'] for b in self.result_bbox]
label = [self.catid2clsid[e] for e in label]
gt_box = [
e['bbox'] for e in self.gt_anno['annotations']
if e['image_id'] == outs['im_id']
infer_results = get_infer_results(outs, self.clsid2catid)
infer_results = infer_results['bbox'] if 'bbox' in infer_results else []
self.results += infer_results
if self.save_prediction_only:
return
gt_boxes = inputs['gt_poly']
gt_labels = inputs['gt_class']
if 'scale_factor' in inputs:
scale_factor = inputs['scale_factor'].numpy() if isinstance(
inputs['scale_factor'],
paddle.Tensor) else inputs['scale_factor']
else:
scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
for i in range(len(gt_boxes)):
gt_box = gt_boxes[i].numpy() if isinstance(
gt_boxes[i], paddle.Tensor) else gt_boxes[i]
h, w = scale_factor[i]
gt_box = gt_box / np.array([w, h, w, h, w, h, w, h])
gt_label = gt_labels[i].numpy() if isinstance(
gt_labels[i], paddle.Tensor) else gt_labels[i]
gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label)
bbox = [
res['bbox'] for res in infer_results
if int(res['image_id']) == int(im_id[i])
]
score = [
res['score'] for res in infer_results
if int(res['image_id']) == int(im_id[i])
]
gt_label = [
e['category_id'] for e in self.gt_anno['annotations']
if e['image_id'] == outs['im_id']
label = [
self.catid2clsid[int(res['category_id'])]
for res in infer_results
if int(res['image_id']) == int(im_id[i])
]
gt_label = [self.catid2clsid[e] for e in gt_label]
self.detection_map.update(bbox, score, label, gt_box, gt_label)
def save_results(self, results, output_dir, imid2path):
if imid2path:
data_dicts = defaultdict(list)
for result in results:
image_id = result['image_id']
data_dicts[image_id].append(result)
for image_id, image_path in imid2path.items():
basename = os.path.splitext(os.path.split(image_path)[-1])[0]
output = os.path.join(output_dir, "{}.txt".format(basename))
dets = data_dicts.get(image_id, [])
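# One line per detection: "<class_name> <score>" followed by the
# predicted polygon coordinates.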
with open(output, 'w') as f:
for det in dets:
catid, bbox, score = det['category_id'], det[
'bbox'], det['score']
bbox_pred = '{} {} '.format(self.catid2name[catid],
score) + ' '.join(
[str(e) for e in bbox])
f.write(bbox_pred + '\n')
logger.info('The bbox result is saved to {}.'.format(output_dir))
else:
output = os.path.join(output_dir, "bbox.json")
with open(output, 'w') as f:
json.dump(results, f)
logger.info('The bbox result is saved to {}.'.format(output))
def accumulate(self):
if len(self.result_bbox) > 0:
output = "bbox.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.result_bbox, f)
logger.info('The bbox result is saved to bbox.json.')
self.save_results(self.results, self.output_eval, self.imid2path)
if self.save_prediction_only:
logger.info('The bbox result is saved to {} and the mAP will not '
'be evaluated.'.format(output))
else:
if not self.save_prediction_only:
logger.info("Accumulating evaluatation results...")
self.detection_map.accumulate()

@ -22,13 +22,21 @@ import sys
import math
from collections import defaultdict
import numpy as np
import paddle
import paddle.nn.functional as F
from paddlers.models.ppdet.modeling.bbox_utils import bbox_iou_np_expand
from .map_utils import ap_per_class
from .metrics import Metric
from .munkres import Munkres
try:
import motmetrics as mm
mm.lap.default_solver = 'lap'
except:
print(
'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
)
pass
from paddlers.models.ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@ -36,8 +44,13 @@ __all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric']
def read_mot_results(filename, is_gt=False, is_ignore=False):
valid_labels = {1}
ignore_labels = {2, 7, 8, 12} # only in motchallenge datasets like 'MOT16'
valid_label = [1]
ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16'
if is_gt:
logger.info(
"In MOT16/17 dataset the valid_label of ground truth is '{}', "
"in other dataset it should be '0' for single classs MOT.".format(
valid_label[0]))
results_dict = dict()
if os.path.isfile(filename):
with open(filename, 'r') as f:
@ -50,12 +63,10 @@ def read_mot_results(filename, is_gt=False, is_ignore=False):
continue
results_dict.setdefault(fid, list())
box_size = float(linelist[4]) * float(linelist[5])
if is_gt:
label = int(float(linelist[7]))
mark = int(float(linelist[6]))
if mark == 0 or label not in valid_labels:
if mark == 0 or label not in valid_label:
continue
score = 1
elif is_ignore:
@ -112,24 +123,31 @@ class MOTEvaluator(object):
self.data_type = data_type
self.load_annotations()
try:
import motmetrics as mm
mm.lap.default_solver = 'lap'
except Exception as e:
raise RuntimeError(
'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
)
self.reset_accumulator()
def load_annotations(self):
assert self.data_type == 'mot'
gt_filename = os.path.join(self.data_root, self.seq_name, 'gt',
'gt.txt')
if not os.path.exists(gt_filename):
logger.warning(
"gt_filename '{}' of MOTEvaluator is not exist, so the MOTA will be -INF."
)
self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True)
self.gt_ignore_frame_dict = read_mot_results(
gt_filename, is_ignore=True)
def reset_accumulator(self):
import motmetrics as mm
mm.lap.default_solver = 'lap'
self.acc = mm.MOTAccumulator(auto_id=True)
def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False):
import motmetrics as mm
mm.lap.default_solver = 'lap'
# results
trk_tlwhs = np.copy(trk_tlwhs)
trk_ids = np.copy(trk_ids)
@ -187,8 +205,6 @@ class MOTEvaluator(object):
names,
metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
'precision', 'recall')):
import motmetrics as mm
mm.lap.default_solver = 'lap'
names = copy.deepcopy(names)
if metrics is None:
metrics = mm.metrics.motchallenge_metrics
@ -225,8 +241,6 @@ class MOTMetric(Metric):
self.result_root = result_root
def accumulate(self):
import motmetrics as mm
import openpyxl
metrics = mm.metrics.motchallenge_metrics
mh = mm.metrics.create()
summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics)
@ -551,7 +565,7 @@ class KITTIEvaluation(object):
"track ids are not unique for sequence %d: frame %d"
% (seq, t_data.frame))
logger.info(
"track id %d occured at least twice for this frame"
"track id %d occurred at least twice for this frame"
% t_data.track_id)
logger.info("Exiting...")
#continue # this allows to evaluate non-unique result files

@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,48 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import paddle
import paddlers.models.ppdet as ppdet
import unittest
# NOTE: weights downloading costs time, we choose
# a small model for unittesting
MODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco'
class TestGetConfigFile(unittest.TestCase):
def test_main(self):
try:
cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME)
assert os.path.isfile(cfg_file)
except:
self.assertTrue(False)
class TestGetModel(unittest.TestCase):
def test_main(self):
try:
model = ppdet.model_zoo.get_model(MODEL_NAME)
assert isinstance(model, paddle.nn.Layer)
except:
self.assertTrue(False)
if __name__ == '__main__':
unittest.main()

@ -0,0 +1,68 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import paddlers.models.ppdet as ppdet
class TestListModel(unittest.TestCase):
def setUp(self):
self._filter = []
def test_main(self):
try:
ppdet.model_zoo.list_model(self._filter)
self.assertTrue(True)
except:
self.assertTrue(False)
class TestListModelYOLO(TestListModel):
def setUp(self):
self._filter = ['yolo']
class TestListModelRCNN(TestListModel):
def setUp(self):
self._filter = ['rcnn']
class TestListModelSSD(TestListModel):
def setUp(self):
self._filter = ['ssd']
class TestListModelMultiFilter(TestListModel):
def setUp(self):
self._filter = ['yolo', 'darknet']
class TestListModelError(unittest.TestCase):
def setUp(self):
self._filter = ['xxx']
def test_main(self):
try:
ppdet.model_zoo.list_model(self._filter)
self.assertTrue(False)
except ValueError:
self.assertTrue(True)
if __name__ == '__main__':
unittest.main()

@ -29,6 +29,7 @@ from . import reid
from . import mot
from . import transformers
from . import assigners
from . import rbox_utils
from .ops import *
from .backbones import *
@ -43,3 +44,4 @@ from .reid import *
from .mot import *
from .transformers import *
from .assigners import *
from .rbox_utils import *

@ -5,6 +5,13 @@
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import meta_arch
from . import faster_rcnn
from . import mask_rcnn
@ -26,6 +33,9 @@ from . import picodet
from . import detr
from . import sparse_rcnn
from . import tood
from . import retinanet
from . import bytetrack
from . import yolox
from .meta_arch import *
from .faster_rcnn import *
@ -49,3 +59,6 @@ from .picodet import *
from .detr import *
from .sparse_rcnn import *
from .tood import *
from .retinanet import *
from .bytetrack import *
from .yolox import *

@ -0,0 +1,79 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddlers.models.ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['ByteTrack']
@register
class ByteTrack(BaseArch):
"""
ByteTrack network, see https://arxiv.org/abs/2110.06864
Args:
detector (object): detector model instance
reid (object): reid model instance, default None
tracker (object): tracker instance
"""
__category__ = 'architecture'
def __init__(self, detector='YOLOX', reid=None, tracker='JDETracker'):
super(ByteTrack, self).__init__()
self.detector = detector
self.reid = reid
self.tracker = tracker
@classmethod
def from_config(cls, cfg, *args, **kwargs):
detector = create(cfg['detector'])
if cfg['reid'] != 'None':
reid = create(cfg['reid'])
else:
reid = None
tracker = create(cfg['tracker'])
return {
"detector": detector,
"reid": reid,
"tracker": tracker,
}
def _forward(self):
det_outs = self.detector(self.inputs)
if self.training:
return det_outs
else:
if self.reid is not None:
assert 'crops' in self.inputs
crops = self.inputs['crops']
pred_embs = self.reid(crops)
else:
pred_embs = None
det_outs['embeddings'] = pred_embs
return det_outs
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
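A hedged sketch of how `from_config` resolves the three sub-models; `create_stub` below is a toy stand-in for ppdet's workspace `create()`, and the config mirrors a typical YAML entry where the literal string `'None'` disables ReID:

```python
def create_stub(name):
    # Toy stand-in for ppdet.core.workspace.create(), which instantiates
    # the registered module named in the config.
    return '<%s instance>' % name

cfg = {'detector': 'YOLOX', 'reid': 'None', 'tracker': 'JDETracker'}
detector = create_stub(cfg['detector'])
reid = None if cfg['reid'] == 'None' else create_stub(cfg['reid'])
tracker = create_stub(cfg['tracker'])
print(detector, reid, tracker)  # <YOLOX instance> None <JDETracker instance>
```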

@ -111,14 +111,14 @@ class CascadeRCNN(BaseArch):
bbox, bbox_num = self.bbox_post_process(
preds, (refined_rois, rois_num), im_shape, scale_factor)
# rescale the prediction back to origin image
bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
im_shape, scale_factor)
bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
bbox, bbox_num, im_shape, scale_factor)
if not self.with_mask:
return bbox_pred, bbox_num, None
mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs)
origin_shape = self.bbox_post_process.get_origin_shape()
mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred,
bbox_num, origin_shape)
mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
origin_shape)
return bbox_pred, bbox_num, mask_pred
def get_loss(self, ):

@ -62,8 +62,9 @@ class DeepSORT(BaseArch):
def _forward(self):
crops = self.inputs['crops']
features = self.reid(crops)
return features
outs = {}
outs['embeddings'] = self.reid(crops)
return outs
def get_pred(self):
return self._forward()

@ -87,8 +87,8 @@ class FasterRCNN(BaseArch):
im_shape, scale_factor)
# rescale the prediction back to origin image
bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
im_shape, scale_factor)
bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
bbox, bbox_num, im_shape, scale_factor)
return bbox_pred, bbox_num
def get_loss(self, ):

@ -153,7 +153,7 @@ class HrHRNetPostProcess(object):
heat_thresh (float): topk values below this threshold will be ignored
tag_thresh (float): tag values sampled in the tagmap below this threshold are grouped as the same person during init
inputs(list[heatmap]): the output list of modle, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk
inputs(list[heatmap]): the output list of model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk
original_height, original_width (float): the original image size
'''

@ -112,11 +112,11 @@ class MaskRCNN(BaseArch):
body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)
# rescale the prediction back to origin image
bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
im_shape, scale_factor)
bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
bbox, bbox_num, im_shape, scale_factor)
origin_shape = self.bbox_post_process.get_origin_shape()
mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred,
bbox_num, origin_shape)
mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
origin_shape)
return bbox_pred, bbox_num, mask_pred
def get_loss(self, ):

@ -22,22 +22,23 @@ class BaseArch(nn.Layer):
self.fuse_norm = False
def load_meanstd(self, cfg_transform):
self.scale = 1.
self.mean = paddle.to_tensor([0.485, 0.456, 0.406]).reshape(
(1, 3, 1, 1))
self.std = paddle.to_tensor([0.229, 0.224, 0.225]).reshape((1, 3, 1, 1))
scale = 1.
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
for item in cfg_transform:
if 'NormalizeImage' in item:
self.mean = paddle.to_tensor(item['NormalizeImage'][
'mean']).reshape((1, 3, 1, 1))
self.std = paddle.to_tensor(item['NormalizeImage'][
'std']).reshape((1, 3, 1, 1))
mean = np.array(
item['NormalizeImage']['mean'], dtype=np.float32)
std = np.array(item['NormalizeImage']['std'], dtype=np.float32)
if item['NormalizeImage'].get('is_scale', True):
self.scale = 1. / 255.
scale = 1. / 255.
break
if self.data_format == 'NHWC':
self.mean = self.mean.reshape(1, 1, 1, 3)
self.std = self.std.reshape(1, 1, 1, 3)
self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3))
self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3))
else:
self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1))
self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1))
def forward(self, inputs):
if self.data_format == 'NHWC':
@ -46,7 +47,7 @@ class BaseArch(nn.Layer):
if self.fuse_norm:
image = inputs['image']
self.inputs['image'] = (image * self.scale - self.mean) / self.std
self.inputs['image'] = image * self.scale + self.bias
self.inputs['im_shape'] = inputs['im_shape']
self.inputs['scale_factor'] = inputs['scale_factor']
else:
@ -63,9 +64,13 @@ class BaseArch(nn.Layer):
inputs_list.append(inputs)
else:
inputs_list.extend(inputs)
outs = []
for inp in inputs_list:
if self.fuse_norm:
self.inputs['image'] = inp['image'] * self.scale + self.bias
self.inputs['im_shape'] = inp['im_shape']
self.inputs['scale_factor'] = inp['scale_factor']
else:
self.inputs = inp
outs.append(self.get_pred())
@ -124,16 +129,3 @@ class BaseArch(nn.Layer):
def get_pred(self, ):
raise NotImplementedError("Should implement get_pred method!")
@classmethod
def convert_sync_batchnorm(cls, layer):
layer_output = layer
if getattr(layer, 'norm_type', None) == 'sync_bn':
layer_output = nn.SyncBatchNorm.convert_sync_batchnorm(layer)
else:
for name, sublayer in layer.named_children():
layer_output.add_sublayer(name,
cls.convert_sync_batchnorm(sublayer))
del layer
return layer_output
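The `load_meanstd` refactor above folds mean/std normalization into a single multiply-add, since `(img * s - mean) / std == img * (s / std) + (-mean / std)`. A quick numpy check of that identity (NCHW shapes and the ImageNet defaults used above):

```python
import numpy as np

img = np.random.rand(1, 3, 4, 4).astype(np.float32) * 255.
mean = np.array([0.485, 0.456, 0.406], np.float32).reshape(1, 3, 1, 1)
std = np.array([0.229, 0.224, 0.225], np.float32).reshape(1, 3, 1, 1)
s = 1. / 255.  # is_scale=True

old = (img * s - mean) / std            # normalize in two steps
new = img * (s / std) + (-mean / std)   # fused scale and bias
assert np.allclose(old, new, atol=1e-5)
```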

@ -41,7 +41,8 @@ class PicoDet(BaseArch):
self.backbone = backbone
self.neck = neck
self.head = head
self.deploy = False
self.export_post_process = True
self.export_nms = True
@classmethod
def from_config(cls, cfg, *args, **kwargs):
@ -62,14 +63,13 @@ class PicoDet(BaseArch):
def _forward(self):
body_feats = self.backbone(self.inputs)
fpn_feats = self.neck(body_feats)
head_outs = self.head(fpn_feats, self.deploy)
if self.training or self.deploy:
head_outs = self.head(fpn_feats, self.export_post_process)
if self.training or not self.export_post_process:
return head_outs, None
else:
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bboxes, bbox_num = self.head.post_process(head_outs, im_shape,
scale_factor)
bboxes, bbox_num = self.head.post_process(
head_outs, scale_factor, export_nms=self.export_nms)
return bboxes, bbox_num
def get_loss(self, ):
@ -83,9 +83,13 @@ class PicoDet(BaseArch):
return loss
def get_pred(self):
if self.deploy:
if not self.export_post_process:
return {'picodet': self._forward()[0]}
else:
elif self.export_nms:
bbox_pred, bbox_num = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
return output
else:
bboxes, mlvl_scores = self._forward()
output = {'bbox': bboxes, 'scores': mlvl_scores}
return output
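The two new flags replace the old `deploy` switch and select among three export outputs. A small sketch of the branch logic in `get_pred` (the helper name is hypothetical, written only to make the three modes explicit):

```python
def picodet_output_keys(export_post_process, export_nms):
    # Mirrors the three branches of PicoDet.get_pred above.
    if not export_post_process:
        return ['picodet']            # raw head outputs; decode in deploy code
    if export_nms:
        return ['bbox', 'bbox_num']   # fully decoded, NMS applied in-model
    return ['bbox', 'scores']         # decoded boxes, NMS left to the runtime

assert picodet_output_keys(False, True) == ['picodet']
assert picodet_output_keys(True, True) == ['bbox', 'bbox_num']
assert picodet_output_keys(True, False) == ['bbox', 'scores']
```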

@ -0,0 +1,68 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddlers.models.ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import paddle
__all__ = ['RetinaNet']
@register
class RetinaNet(BaseArch):
__category__ = 'architecture'
def __init__(self, backbone, neck, head):
super(RetinaNet, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
'head': head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
neck_feats = self.neck(body_feats)
if self.training:
return self.head(neck_feats, self.inputs)
else:
head_outs = self.head(neck_feats)
bbox, bbox_num = self.head.post_process(
head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
return {'bbox': bbox, 'bbox_num': bbox_num}
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
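The `from_config` pattern threads output shapes forward: the backbone's `out_shape` becomes the neck's `input_shape`, and the neck's `out_shape` becomes the head's. A toy illustration with stub classes (not the real ppdet modules):

```python
class StubModule:
    # Stand-in for a module built via create(cfg[...], **kwargs).
    def __init__(self, out_shape, input_shape=None):
        self.input_shape = input_shape
        self.out_shape = out_shape

backbone = StubModule(out_shape=[512, 1024, 2048])
neck = StubModule(out_shape=[256, 256, 256], input_shape=backbone.out_shape)
head = StubModule(out_shape=None, input_shape=neck.out_shape)
print(head.input_shape)  # [256, 256, 256]
```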

@ -26,26 +26,21 @@ __all__ = ['S2ANet']
@register
class S2ANet(BaseArch):
__category__ = 'architecture'
__inject__ = [
's2anet_head',
's2anet_bbox_post_process',
]
__inject__ = ['head']
def __init__(self, backbone, neck, s2anet_head, s2anet_bbox_post_process):
def __init__(self, backbone, neck, head):
"""
S2ANet, see https://arxiv.org/pdf/2008.09397.pdf
Args:
backbone (object): backbone instance
neck (object): `FPN` instance
s2anet_head (object): `S2ANetHead` instance
s2anet_bbox_post_process (object): `S2ANetBBoxPostProcess` instance
head (object): `Head` instance
"""
super(S2ANet, self).__init__()
self.backbone = backbone
self.neck = neck
self.s2anet_head = s2anet_head
self.s2anet_bbox_post_process = s2anet_bbox_post_process
self.s2anet_head = head
@classmethod
def from_config(cls, cfg, *args, **kwargs):
@ -55,42 +50,28 @@ class S2ANet(BaseArch):
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
s2anet_head = create(cfg['s2anet_head'], **kwargs)
s2anet_bbox_post_process = create(cfg['s2anet_bbox_post_process'],
**kwargs)
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"s2anet_head": s2anet_head,
"s2anet_bbox_post_process": s2anet_bbox_post_process,
}
return {'backbone': backbone, 'neck': neck, "head": head}
def _forward(self):
body_feats = self.backbone(self.inputs)
if self.neck is not None:
body_feats = self.neck(body_feats)
self.s2anet_head(body_feats)
if self.training:
loss = self.s2anet_head.get_loss(self.inputs)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
loss = self.s2anet_head(body_feats, self.inputs)
return loss
else:
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
nms_pre = self.s2anet_bbox_post_process.nms_pre
pred_scores, pred_bboxes = self.s2anet_head.get_prediction(nms_pre)
head_outs = self.s2anet_head(body_feats)
# post_process
pred_bboxes, bbox_num = self.s2anet_bbox_post_process(pred_scores,
pred_bboxes)
bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs)
# rescale the prediction back to origin image
pred_bboxes = self.s2anet_bbox_post_process.get_pred(
pred_bboxes, bbox_num, im_shape, scale_factor)
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape,
scale_factor)
# output
output = {'bbox': pred_bboxes, 'bbox_num': bbox_num}
output = {'bbox': bboxes, 'bbox_num': bbox_num}
return output
def get_loss(self, ):

@ -109,10 +109,13 @@ class YOLOv3(BaseArch):
if self.return_idx:
_, bbox, bbox_num, _ = self.post_process(
yolo_head_outs, self.yolo_head.mask_anchors)
else:
elif self.post_process is not None:
bbox, bbox_num = self.post_process(
yolo_head_outs, self.yolo_head.mask_anchors,
self.inputs['im_shape'], self.inputs['scale_factor'])
else:
bbox, bbox_num = self.yolo_head.post_process(
yolo_head_outs, self.inputs['scale_factor'])
output = {'bbox': bbox, 'bbox_num': bbox_num}
return output

@ -0,0 +1,138 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddlers.models.ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import random
import paddle
import paddle.nn.functional as F
import paddle.distributed as dist
__all__ = ['YOLOX']
@register
class YOLOX(BaseArch):
"""
YOLOX network, see https://arxiv.org/abs/2107.08430
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): neck instance
head (nn.Layer): head instance
for_mot (bool): whether used for MOT or not
input_size (list[int]): initial scale, will be reset by self._preprocess()
size_stride (int): stride of the size range
size_range (list[int]): multi-scale range for training
random_interval (int): number of iterations between changes of self._input_size
"""
__category__ = 'architecture'
def __init__(self,
backbone='CSPDarkNet',
neck='YOLOCSPPAN',
head='YOLOXHead',
for_mot=False,
input_size=[640, 640],
size_stride=32,
size_range=[15, 25],
random_interval=10):
super(YOLOX, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
self.for_mot = for_mot
self.input_size = input_size
self._input_size = paddle.to_tensor(input_size)
self.size_stride = size_stride
self.size_range = size_range
self.random_interval = random_interval
self._step = 0
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# fpn
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
# head
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"head": head,
}
def _forward(self):
if self.training:
self._preprocess()
body_feats = self.backbone(self.inputs)
neck_feats = self.neck(body_feats, self.for_mot)
if self.training:
yolox_losses = self.head(neck_feats, self.inputs)
yolox_losses.update({'size': self._input_size[0]})
return yolox_losses
else:
head_outs = self.head(neck_feats)
bbox, bbox_num = self.head.post_process(
head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
return {'bbox': bbox, 'bbox_num': bbox_num}
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
def _preprocess(self):
# YOLOX multi-scale training: resize the input batch by interpolation before it enters the network.
self._get_size()
scale_y = self._input_size[0] / self.input_size[0]
scale_x = self._input_size[1] / self.input_size[1]
if scale_x != 1 or scale_y != 1:
self.inputs['image'] = F.interpolate(
self.inputs['image'],
size=self._input_size,
mode='bilinear',
align_corners=False)
gt_bboxes = self.inputs['gt_bbox']
for i in range(len(gt_bboxes)):
if len(gt_bboxes[i]) > 0:
gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x
gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y
self.inputs['gt_bbox'] = gt_bboxes
def _get_size(self):
# random_interval defaults to 10: self._input_size is re-drawn every random_interval iters
image_ratio = self.input_size[1] * 1.0 / self.input_size[0]
if self._step % self.random_interval == 0:
size_factor = random.randint(*self.size_range)
size = [
self.size_stride * size_factor,
self.size_stride * int(size_factor * image_ratio)
]
self._input_size = paddle.to_tensor(size)
self._step += 1
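With the defaults above (`size_stride=32`, `size_range=[15, 25]`, square `input_size`), the candidate training resolutions can be enumerated directly; a new one is drawn every `random_interval` iterations and the ground-truth boxes are rescaled accordingly:

```python
import random

size_stride, size_range = 32, [15, 25]  # defaults from __init__
candidates = [size_stride * k for k in range(size_range[0], size_range[1] + 1)]
print(candidates)  # [480, 512, 544, ..., 768, 800]

# _preprocess resizes the batch to the drawn size and rescales gt boxes:
new_size = random.choice(candidates)
scale = new_size / 640.  # relative to the initial 640x640 input_size
```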

@ -16,8 +16,10 @@ from . import utils
from . import task_aligned_assigner
from . import atss_assigner
from . import simota_assigner
from . import max_iou_assigner
from .utils import *
from .task_aligned_assigner import *
from .atss_assigner import *
from .simota_assigner import *
from .max_iou_assigner import *

@ -22,11 +22,13 @@ import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppdet.core.workspace import register
from ..ops import iou_similarity
from ..bbox_utils import iou_similarity, batch_iou_similarity
from ..bbox_utils import bbox_center
from .utils import (pad_gt, check_points_inside_bboxes, compute_max_iou_anchor,
from .utils import (check_points_inside_bboxes, compute_max_iou_anchor,
compute_max_iou_gt)
__all__ = ['ATSSAssigner']
@register
class ATSSAssigner(nn.Layer):
@ -48,7 +50,6 @@ class ATSSAssigner(nn.Layer):
def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list,
pad_gt_mask):
pad_gt_mask = pad_gt_mask.tile([1, 1, self.topk]).astype(paddle.bool)
gt2anchor_distances_list = paddle.split(
gt2anchor_distances, num_anchors_list, axis=-1)
num_anchors_index = np.cumsum(num_anchors_list).tolist()
@ -58,15 +59,12 @@ class ATSSAssigner(nn.Layer):
for distances, anchors_index in zip(gt2anchor_distances_list,
num_anchors_index):
num_anchors = distances.shape[-1]
topk_metrics, topk_idxs = paddle.topk(
_, topk_idxs = paddle.topk(
distances, self.topk, axis=-1, largest=False)
topk_idxs_list.append(topk_idxs + anchors_index)
topk_idxs = paddle.where(pad_gt_mask, topk_idxs,
paddle.zeros_like(topk_idxs))
is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2)
is_in_topk = paddle.where(is_in_topk > 1,
paddle.zeros_like(is_in_topk), is_in_topk)
is_in_topk_list.append(is_in_topk.astype(gt2anchor_distances.dtype))
is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(
axis=-2).astype(gt2anchor_distances.dtype)
is_in_topk_list.append(is_in_topk * pad_gt_mask)
is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1)
topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1)
return is_in_topk_list, topk_idxs_list
@ -77,8 +75,10 @@ class ATSSAssigner(nn.Layer):
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index,
gt_scores=None):
gt_scores=None,
pred_bboxes=None):
r"""This code is based on
https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
@ -99,18 +99,18 @@ class ATSSAssigner(nn.Layer):
anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4),
"xmin, xmax, ymin, ymax" format
num_anchors_list (List): num of anchors in each level
gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape(B, n, 4)
gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
bg_index (int): background index
gt_scores (Tensor|List[Tensor]|None, float32) Score of gt_bboxes,
gt_scores (Tensor|None, float32): Score of gt_bboxes,
shape(B, n, 1), if None, then it will initialize with one_hot label
pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4)
Returns:
assigned_labels (Tensor): (B, L)
assigned_bboxes (Tensor): (B, L, 4)
assigned_scores (Tensor): (B, L, C)
assigned_scores (Tensor): (B, L, C); if pred_bboxes is not None, the scores are weighted by the assigned IoUs
"""
gt_labels, gt_bboxes, pad_gt_scores, pad_gt_mask = pad_gt(
gt_labels, gt_bboxes, gt_scores)
assert gt_labels.ndim == gt_bboxes.ndim and \
gt_bboxes.ndim == 3
@ -119,7 +119,8 @@ class ATSSAssigner(nn.Layer):
# negative batch
if num_max_boxes == 0:
assigned_labels = paddle.full([batch_size, num_anchors], bg_index)
assigned_labels = paddle.full(
[batch_size, num_anchors], bg_index, dtype='int32')
assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
assigned_scores = paddle.zeros(
[batch_size, num_anchors, self.num_classes])
@ -149,9 +150,8 @@ class ATSSAssigner(nn.Layer):
iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])
iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \
iou_threshold.std(axis=-1, keepdim=True)
is_in_topk = paddle.where(
iou_candidates > iou_threshold.tile([1, 1, num_anchors]),
is_in_topk, paddle.zeros_like(is_in_topk))
is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk,
paddle.zeros_like(is_in_topk))
# 6. check the positive sample's center in gt, [B, n, L]
is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
@ -178,9 +178,6 @@ class ATSSAssigner(nn.Layer):
mask_positive)
mask_positive_sum = mask_positive.sum(axis=-2)
assigned_gt_index = mask_positive.argmax(axis=-2)
assert mask_positive_sum.max() == 1, \
("one anchor just assign one gt, but received not equals 1. "
"Received: %f" % mask_positive_sum.max().item())
# assigned target
batch_ind = paddle.arange(
@ -197,10 +194,19 @@ class ATSSAssigner(nn.Layer):
gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
assigned_scores = F.one_hot(assigned_labels, self.num_classes)
if gt_scores is not None:
assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1)
ind = list(range(self.num_classes + 1))
ind.remove(bg_index)
assigned_scores = paddle.index_select(
assigned_scores, paddle.to_tensor(ind), axis=-1)
if pred_bboxes is not None:
# assigned iou
ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive
ious = ious.max(axis=-2).unsqueeze(-1)
assigned_scores *= ious
elif gt_scores is not None:
gather_scores = paddle.gather(
pad_gt_scores.flatten(), assigned_gt_index.flatten(), axis=0)
gt_scores.flatten(), assigned_gt_index.flatten(), axis=0)
gather_scores = gather_scores.reshape([batch_size, num_anchors])
gather_scores = paddle.where(mask_positive_sum > 0, gather_scores,
paddle.zeros_like(gather_scores))
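The `F.one_hot(..., num_classes + 1)` plus `index_select` idiom above builds class scores while dropping the background column, so background anchors end up with an all-zero score vector. A minimal paddle check (3 foreground classes, `bg_index=3`):

```python
import paddle
import paddle.nn.functional as F

num_classes, bg_index = 3, 3
labels = paddle.to_tensor([0, 2, 3])            # 3 == background
onehot = F.one_hot(labels, num_classes + 1)     # shape [3, 4]
ind = [i for i in range(num_classes + 1) if i != bg_index]
scores = paddle.index_select(onehot, paddle.to_tensor(ind), axis=-1)
print(scores.numpy())
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 0. 0.]]  <- the background row becomes an all-zero score vector
```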

@ -0,0 +1,54 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddlers.models.ppdet.core.workspace import register
from paddlers.models.ppdet.modeling.proposal_generator.target import label_box
__all__ = ['MaxIoUAssigner']
@register
class MaxIoUAssigner(object):
"""a standard bbox assigner based on max IoU, use ppdet's label_box
as backend.
Args:
positive_overlap (float): threshold for defining positive samples
negative_overlap (float): threshold for defining negative samples
allow_low_quality (bool): whether to lower IoU thr if a GT poorly
overlaps with candidate bboxes
"""
def __init__(self,
positive_overlap,
negative_overlap,
allow_low_quality=True):
self.positive_overlap = positive_overlap
self.negative_overlap = negative_overlap
self.allow_low_quality = allow_low_quality
def __call__(self, bboxes, gt_bboxes):
matches, match_labels = label_box(
bboxes,
gt_bboxes,
positive_overlap=self.positive_overlap,
negative_overlap=self.negative_overlap,
allow_low_quality=self.allow_low_quality,
ignore_thresh=-1,
is_crowd=None,
assign_on_cpu=False)
return matches, match_labels
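For intuition, a numpy toy of the max-IoU semantics (this mimics what `label_box` computes, not its implementation): each candidate box matches its best gt; IoU at or above `positive_overlap` is positive (1), below `negative_overlap` is negative (0), anything in between is ignored (-1):

```python
import numpy as np

def assign(ious, pos_thr=0.7, neg_thr=0.3):
    # ious: [num_gt, num_boxes] pairwise IoU matrix
    matches = ious.argmax(axis=0)        # best-matching gt per box
    best = ious.max(axis=0)
    labels = np.full(ious.shape[1], -1)  # -1: ignored
    labels[best < neg_thr] = 0           # background
    labels[best >= pos_thr] = 1          # foreground
    return matches, labels

ious = np.array([[0.8, 0.5, 0.1],
                 [0.2, 0.6, 0.05]])
print(assign(ious))  # (array([0, 1, 0]), array([ 1, -1,  0]))
```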

@ -115,7 +115,10 @@ class SimOTAAssigner(object):
def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt):
match_matrix = np.zeros_like(cost_matrix.numpy())
# select candidate topk ious for dynamic-k calculation
topk_ious, _ = paddle.topk(pairwise_ious, self.candidate_topk, axis=0)
topk_ious, _ = paddle.topk(
pairwise_ious,
min(self.candidate_topk, pairwise_ious.shape[0]),
axis=0)
# calculate dynamic k for each gt
dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1)
for gt_idx in range(num_gt):

@ -21,10 +21,12 @@ import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppdet.core.workspace import register
from ..bbox_utils import iou_similarity
from .utils import (pad_gt, gather_topk_anchors, check_points_inside_bboxes,
from ..bbox_utils import batch_iou_similarity
from .utils import (gather_topk_anchors, check_points_inside_bboxes,
compute_max_iou_anchor)
__all__ = ['TaskAlignedAssigner']
@register
class TaskAlignedAssigner(nn.Layer):
@ -43,8 +45,10 @@ class TaskAlignedAssigner(nn.Layer):
pred_scores,
pred_bboxes,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index,
gt_scores=None):
r"""This code is based on
@ -61,20 +65,18 @@ class TaskAlignedAssigner(nn.Layer):
pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape(B, n, 4)
num_anchors_list (List): num of anchors in each level, shape(L)
gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
bg_index (int): background index
gt_scores (Tensor|List[Tensor]|None, float32) Score of gt_bboxes,
shape(B, n, 1), if None, then it will initialize with one_hot label
gt_scores (Tensor|None, float32): Score of gt_bboxes, shape(B, n, 1)
Returns:
assigned_labels (Tensor): (B, L)
assigned_bboxes (Tensor): (B, L, 4)
assigned_scores (Tensor): (B, L, C)
"""
assert pred_scores.ndim == pred_bboxes.ndim
gt_labels, gt_bboxes, pad_gt_scores, pad_gt_mask = pad_gt(
gt_labels, gt_bboxes, gt_scores)
assert gt_labels.ndim == gt_bboxes.ndim and \
gt_bboxes.ndim == 3
@ -83,14 +85,15 @@ class TaskAlignedAssigner(nn.Layer):
# negative batch
if num_max_boxes == 0:
assigned_labels = paddle.full([batch_size, num_anchors], bg_index)
assigned_labels = paddle.full(
[batch_size, num_anchors], bg_index, dtype='int32')
assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
assigned_scores = paddle.zeros(
[batch_size, num_anchors, num_classes])
return assigned_labels, assigned_bboxes, assigned_scores
# compute iou between gt and pred bbox, [B, n, L]
ious = iou_similarity(gt_bboxes, pred_bboxes)
ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
# gather pred bboxes class score
pred_scores = pred_scores.transpose([0, 2, 1])
batch_ind = paddle.arange(
@ -109,9 +112,7 @@ class TaskAlignedAssigner(nn.Layer):
# select topk largest alignment metrics pred bbox as candidates
# for each gt, [B, n, L]
is_in_topk = gather_topk_anchors(
alignment_metrics * is_in_gts,
self.topk,
topk_mask=pad_gt_mask.tile([1, 1, self.topk]).astype(paddle.bool))
alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)
# select positive sample, [B, n, L]
mask_positive = is_in_topk * is_in_gts * pad_gt_mask
@ -127,9 +128,6 @@ class TaskAlignedAssigner(nn.Layer):
mask_positive)
mask_positive_sum = mask_positive.sum(axis=-2)
assigned_gt_index = mask_positive.argmax(axis=-2)
assert mask_positive_sum.max() == 1, \
("one anchor just assign one gt, but received not equals 1. "
"Received: %f" % mask_positive_sum.max().item())
# assigned target
assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
@ -144,7 +142,11 @@ class TaskAlignedAssigner(nn.Layer):
gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
assigned_scores = F.one_hot(assigned_labels, num_classes)
assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
ind = list(range(num_classes + 1))
ind.remove(bg_index)
assigned_scores = paddle.index_select(
assigned_scores, paddle.to_tensor(ind), axis=-1)
# rescale alignment metrics
alignment_metrics *= mask_positive
max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)

@ -88,7 +88,7 @@ def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
largest (bool) : largest is a flag, if set to true,
algorithm will sort by descending order, otherwise sort by
ascending order. Default: True
topk_mask (Tensor, bool|None): shape[B, n, topk], ignore bbox mask,
topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask,
Default: None
eps (float): Default: 1e-9
Returns:
@ -98,20 +98,22 @@ def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
topk_metrics, topk_idxs = paddle.topk(
metrics, topk, axis=-1, largest=largest)
if topk_mask is None:
topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > eps).tile(
[1, 1, topk])
topk_idxs = paddle.where(topk_mask, topk_idxs, paddle.zeros_like(topk_idxs))
is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2)
is_in_topk = paddle.where(is_in_topk > 1,
paddle.zeros_like(is_in_topk), is_in_topk)
return is_in_topk.astype(metrics.dtype)
topk_mask = (
topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype)
is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(
axis=-2).astype(metrics.dtype)
return is_in_topk * topk_mask
def check_points_inside_bboxes(points, bboxes, eps=1e-9):
def check_points_inside_bboxes(points,
bboxes,
center_radius_tensor=None,
eps=1e-9):
r"""
Args:
points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format
center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None.
eps (float): Default: 1e-9
Returns:
is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected
@ -119,12 +121,28 @@ def check_points_inside_bboxes(points, bboxes, eps=1e-9):
points = points.unsqueeze([0, 1])
x, y = points.chunk(2, axis=-1)
xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1)
# check whether `points` is in `bboxes`
l = x - xmin
t = y - ymin
r = xmax - x
b = ymax - y
bbox_ltrb = paddle.concat([l, t, r, b], axis=-1)
return (bbox_ltrb.min(axis=-1) > eps).astype(bboxes.dtype)
delta_ltrb = paddle.concat([l, t, r, b], axis=-1)
is_in_bboxes = (delta_ltrb.min(axis=-1) > eps)
if center_radius_tensor is not None:
# check whether `points` is in `center_radius`
center_radius_tensor = center_radius_tensor.unsqueeze([0, 1])
cx = (xmin + xmax) * 0.5
cy = (ymin + ymax) * 0.5
l = x - (cx - center_radius_tensor)
t = y - (cy - center_radius_tensor)
r = (cx + center_radius_tensor) - x
b = (cy + center_radius_tensor) - y
delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1)
is_in_center = (delta_ltrb_c.min(axis=-1) > eps)
return (paddle.logical_and(is_in_bboxes, is_in_center),
paddle.logical_or(is_in_bboxes, is_in_center))
return is_in_bboxes.astype(bboxes.dtype)
def compute_max_iou_anchor(ious):
@ -158,7 +176,8 @@ def compute_max_iou_gt(ious):
def generate_anchors_for_grid_cell(feats,
fpn_strides,
grid_cell_size=5.0,
grid_cell_offset=0.5):
grid_cell_offset=0.5,
dtype='float32'):
r"""
Like ATSS, generate anchors based on grid size.
Args:
@ -167,14 +186,16 @@ def generate_anchors_for_grid_cell(feats,
grid_cell_size (float): anchor size
grid_cell_offset (float): The range is between 0 and 1.
Returns:
anchors (List[Tensor]): shape[s, (l, 4)]
num_anchors_list (List[int]): shape[s]
stride_tensor_list (List[Tensor]): shape[s, (l, 1)]
anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format.
anchor_points (Tensor): shape[l, 2], "x, y" format.
num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...].
stride_tensor (Tensor): shape[l, 1], contains the stride for each scale.
"""
assert len(feats) == len(fpn_strides)
anchors = []
anchor_points = []
num_anchors_list = []
stride_tensor_list = []
stride_tensor = []
for feat, stride in zip(feats, fpn_strides):
_, _, h, w = feat.shape
cell_half_size = grid_cell_size * stride * 0.5
@ -186,9 +207,19 @@ def generate_anchors_for_grid_cell(feats,
shift_x - cell_half_size, shift_y - cell_half_size,
shift_x + cell_half_size, shift_y + cell_half_size
],
axis=-1).astype(feat.dtype)
axis=-1).astype(dtype)
anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype)
anchors.append(anchor.reshape([-1, 4]))
anchor_points.append(anchor_point.reshape([-1, 2]))
num_anchors_list.append(len(anchors[-1]))
stride_tensor_list.append(
paddle.full([num_anchors_list[-1], 1], stride))
return anchors, num_anchors_list, stride_tensor_list
stride_tensor.append(
paddle.full(
[num_anchors_list[-1], 1], stride, dtype=dtype))
anchors = paddle.concat(anchors)
anchors.stop_gradient = True
anchor_points = paddle.concat(anchor_points)
anchor_points.stop_gradient = True
stride_tensor = paddle.concat(stride_tensor)
stride_tensor.stop_gradient = True
return anchors, anchor_points, num_anchors_list, stride_tensor
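After this change the helper returns flat, concatenated tensors instead of per-level lists, so the anchor dimension `l` is just the total number of feature-map cells. A quick shape check (hypothetical 640x640 input with strides 8/16/32):

```python
fpn_strides = [8, 16, 32]
hw = [(640 // s, 640 // s) for s in fpn_strides]  # [(80,80),(40,40),(20,20)]
num_anchors_list = [h * w for h, w in hw]         # [6400, 1600, 400]
total = sum(num_anchors_list)                     # 8400

# The returned tensors therefore have shapes:
#   anchors:       [8400, 4]
#   anchor_points: [8400, 2]
#   stride_tensor: [8400, 1]
print(num_anchors_list, total)
```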

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -29,6 +29,11 @@ from . import swin_transformer
from . import lcnet
from . import hardnet
from . import esnet
from . import cspresnet
from . import csp_darknet
from . import convnext
from . import vision_transformer
from . import mobileone
from .vgg import *
from .resnet import *
@ -47,3 +52,9 @@ from .swin_transformer import *
from .lcnet import *
from .hardnet import *
from .esnet import *
from .cspresnet import *
from .csp_darknet import *
from .convnext import *
from .vision_transformer import *
from .mobileone import *

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -0,0 +1,245 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Modified from https://github.com/facebookresearch/ConvNeXt
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
'''
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant
import numpy as np
from paddlers.models.ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
from .transformer_utils import DropPath, trunc_normal_, zeros_
__all__ = ['ConvNeXt']
class Block(nn.Layer):
r""" ConvNeXt Block. There are two equivalent implementations:
(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
We use (2) as we find it slightly faster in PaddlePaddle
Args:
dim (int): Number of input channels.
drop_path (float): Stochastic depth rate. Default: 0.0
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""
def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
super().__init__()
self.dwconv = nn.Conv2D(
dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
self.norm = LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(
dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.pwconv2 = nn.Linear(4 * dim, dim)
if layer_scale_init_value > 0:
self.gamma = self.create_parameter(
shape=(dim, ),
attr=ParamAttr(initializer=Constant(layer_scale_init_value)))
else:
self.gamma = None
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity(
)
def forward(self, x):
input = x
x = self.dwconv(x)
x = x.transpose([0, 2, 3, 1])
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.pwconv2(x)
if self.gamma is not None:
x = self.gamma * x
x = x.transpose([0, 3, 1, 2])
x = input + self.drop_path(x)
return x
class LayerNorm(nn.Layer):
r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
self.weight = self.create_parameter(
shape=(normalized_shape, ),
attr=ParamAttr(initializer=Constant(1.)))
self.bias = self.create_parameter(
shape=(normalized_shape, ),
attr=ParamAttr(initializer=Constant(0.)))
self.eps = eps
self.data_format = data_format
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError
self.normalized_shape = (normalized_shape, )
def forward(self, x):
if self.data_format == "channels_last":
return F.layer_norm(x, self.normalized_shape, self.weight,
self.bias, self.eps)
elif self.data_format == "channels_first":
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / paddle.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
@register
@serializable
class ConvNeXt(nn.Layer):
r""" ConvNeXt
A PaddlePaddle impl of `A ConvNet for the 2020s` -
https://arxiv.org/pdf/2201.03545.pdf
Args:
in_chans (int): Number of input image channels. Default: 3
depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
dims (tuple(int)): Feature dimension at each stage. Default: [96, 192, 384, 768]
drop_path_rate (float): Stochastic depth rate. Default: 0.
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""
arch_settings = {
'tiny': {
'depths': [3, 3, 9, 3],
'dims': [96, 192, 384, 768]
},
'small': {
'depths': [3, 3, 27, 3],
'dims': [96, 192, 384, 768]
},
'base': {
'depths': [3, 3, 27, 3],
'dims': [128, 256, 512, 1024]
},
'large': {
'depths': [3, 3, 27, 3],
'dims': [192, 384, 768, 1536]
},
'xlarge': {
'depths': [3, 3, 27, 3],
'dims': [256, 512, 1024, 2048]
},
}
def __init__(
self,
arch='tiny',
in_chans=3,
drop_path_rate=0.,
layer_scale_init_value=1e-6,
return_idx=[1, 2, 3],
norm_output=True,
pretrained=None, ):
super().__init__()
depths = self.arch_settings[arch]['depths']
dims = self.arch_settings[arch]['dims']
self.downsample_layers = nn.LayerList(
) # stem and 3 intermediate downsampling conv layers
stem = nn.Sequential(
nn.Conv2D(
in_chans, dims[0], kernel_size=4, stride=4),
LayerNorm(
dims[0], eps=1e-6, data_format="channels_first"))
self.downsample_layers.append(stem)
for i in range(3):
downsample_layer = nn.Sequential(
LayerNorm(
dims[i], eps=1e-6, data_format="channels_first"),
nn.Conv2D(
dims[i], dims[i + 1], kernel_size=2, stride=2), )
self.downsample_layers.append(downsample_layer)
self.stages = nn.LayerList(
) # 4 feature resolution stages, each consisting of multiple residual blocks
dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))]
cur = 0
for i in range(4):
stage = nn.Sequential(*[
Block(
dim=dims[i],
drop_path=dp_rates[cur + j],
layer_scale_init_value=layer_scale_init_value)
for j in range(depths[i])
])
self.stages.append(stage)
cur += depths[i]
self.return_idx = return_idx
self.dims = [dims[i] for i in return_idx] # [::-1]
self.norm_output = norm_output
if norm_output:
self.norms = nn.LayerList([
LayerNorm(
c, eps=1e-6, data_format="channels_first")
for c in self.dims
])
self.apply(self._init_weights)
if pretrained is not None:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
self.set_state_dict(paddle.load(path))
def _init_weights(self, m):
if isinstance(m, (nn.Conv2D, nn.Linear)):
trunc_normal_(m.weight)
zeros_(m.bias)
def forward_features(self, x):
output = []
for i in range(4):
x = self.downsample_layers[i](x)
x = self.stages[i](x)
output.append(x)
outputs = [output[i] for i in self.return_idx]
if self.norm_output:
outputs = [self.norms[i](out) for i, out in enumerate(outputs)]
return outputs
def forward(self, x):
x = self.forward_features(x['image'])
return x
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self.dims]
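A hedged usage sketch of the backbone as registered here (the import path follows this repo's layout and is an assumption; the default `return_idx=[1, 2, 3]` keeps the last three stages of the 'tiny' dims `[96, 192, 384, 768]`):

```python
import paddle
# Assumed module path within this repository:
from paddlers.models.ppdet.modeling.backbones.convnext import ConvNeXt

model = ConvNeXt(arch='tiny')
feats = model({'image': paddle.rand([1, 3, 224, 224])})
print([f.shape for f in feats])
# [1, 192, 28, 28], [1, 384, 14, 14], [1, 768, 7, 7]
print([spec.channels for spec in model.out_shape])  # [192, 384, 768]
```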

@ -0,0 +1,404 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddlers.models.ppdet.core.workspace import register, serializable
from paddlers.models.ppdet.modeling.initializer import conv_init_
from ..shape_spec import ShapeSpec
__all__ = [
'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
]
class BaseConv(nn.Layer):
def __init__(self,
in_channels,
out_channels,
ksize,
stride,
groups=1,
bias=False,
act="silu"):
super(BaseConv, self).__init__()
self.conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=ksize,
stride=stride,
padding=(ksize - 1) // 2,
groups=groups,
bias_attr=bias)
self.bn = nn.BatchNorm2D(
out_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self._init_weights()
def _init_weights(self):
conv_init_(self.conv)
def forward(self, x):
# use 'x * F.sigmoid(x)' to implement 'silu'
x = self.bn(self.conv(x))
y = x * F.sigmoid(x)
return y
class DWConv(nn.Layer):
"""Depthwise Conv"""
def __init__(self,
in_channels,
out_channels,
ksize,
stride=1,
bias=False,
act="silu"):
super(DWConv, self).__init__()
self.dw_conv = BaseConv(
in_channels,
in_channels,
ksize=ksize,
stride=stride,
groups=in_channels,
bias=bias,
act=act)
self.pw_conv = BaseConv(
in_channels,
out_channels,
ksize=1,
stride=1,
groups=1,
bias=bias,
act=act)
def forward(self, x):
return self.pw_conv(self.dw_conv(x))
class Focus(nn.Layer):
"""Focus width and height information into channel space, used in YOLOX."""
def __init__(self,
in_channels,
out_channels,
ksize=3,
stride=1,
bias=False,
act="silu"):
super(Focus, self).__init__()
self.conv = BaseConv(
in_channels * 4,
out_channels,
ksize=ksize,
stride=stride,
bias=bias,
act=act)
def forward(self, inputs):
# inputs [bs, C, H, W] -> outputs [bs, 4C, H/2, W/2]
top_left = inputs[:, :, 0::2, 0::2]
top_right = inputs[:, :, 0::2, 1::2]
bottom_left = inputs[:, :, 1::2, 0::2]
bottom_right = inputs[:, :, 1::2, 1::2]
outputs = paddle.concat(
[top_left, bottom_left, top_right, bottom_right], 1)
return self.conv(outputs)
class BottleNeck(nn.Layer):
def __init__(self,
in_channels,
out_channels,
shortcut=True,
expansion=0.5,
depthwise=False,
bias=False,
act="silu"):
super(BottleNeck, self).__init__()
hidden_channels = int(out_channels * expansion)
Conv = DWConv if depthwise else BaseConv
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.conv2 = Conv(
hidden_channels,
out_channels,
ksize=3,
stride=1,
bias=bias,
act=act)
self.add_shortcut = shortcut and in_channels == out_channels
def forward(self, x):
y = self.conv2(self.conv1(x))
if self.add_shortcut:
y = y + x
return y
class SPPLayer(nn.Layer):
"""Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX"""
def __init__(self,
in_channels,
out_channels,
kernel_sizes=(5, 9, 13),
bias=False,
act="silu"):
super(SPPLayer, self).__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.maxpoolings = nn.LayerList([
nn.MaxPool2D(
kernel_size=ks, stride=1, padding=ks // 2)
for ks in kernel_sizes
])
conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
self.conv2 = BaseConv(
conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
def forward(self, x):
x = self.conv1(x)
x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1)
x = self.conv2(x)
return x
class SPPFLayer(nn.Layer):
""" Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
equivalent to SPP(k=(5, 9, 13))
"""
def __init__(self,
in_channels,
out_channels,
ksize=5,
bias=False,
act='silu'):
super(SPPFLayer, self).__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.maxpooling = nn.MaxPool2D(
kernel_size=ksize, stride=1, padding=ksize // 2)
conv2_channels = hidden_channels * 4
self.conv2 = BaseConv(
conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
def forward(self, x):
x = self.conv1(x)
y1 = self.maxpooling(x)
y2 = self.maxpooling(y1)
y3 = self.maxpooling(y2)
concats = paddle.concat([x, y1, y2, y3], axis=1)
out = self.conv2(concats)
return out
class CSPLayer(nn.Layer):
"""CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5"""
def __init__(self,
in_channels,
out_channels,
num_blocks=1,
shortcut=True,
expansion=0.5,
depthwise=False,
bias=False,
act="silu"):
super(CSPLayer, self).__init__()
hidden_channels = int(out_channels * expansion)
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.conv2 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.bottlenecks = nn.Sequential(*[
BottleNeck(
hidden_channels,
hidden_channels,
shortcut=shortcut,
expansion=1.0,
depthwise=depthwise,
bias=bias,
act=act) for _ in range(num_blocks)
])
self.conv3 = BaseConv(
hidden_channels * 2,
out_channels,
ksize=1,
stride=1,
bias=bias,
act=act)
def forward(self, x):
x_1 = self.conv1(x)
x_1 = self.bottlenecks(x_1)
x_2 = self.conv2(x)
x = paddle.concat([x_1, x_2], axis=1)
x = self.conv3(x)
return x
@register
@serializable
class CSPDarkNet(nn.Layer):
"""
CSPDarkNet backbone.
Args:
arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
depth_mult (float): Depth multiplier, multiplies the number of blocks in
each CSPLayer, default as 1.0.
width_mult (float): Width multiplier, multiplies the number of channels in
each layer, default as 1.0.
depthwise (bool): Whether to use depth-wise conv layer.
act (str): Activation function type, default as 'silu'.
return_idx (list): Index of stages whose feature maps are returned.
"""
__shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
# in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
# 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
arch_settings = {
'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
[256, 512, 9, True, False], [512, 1024, 3, False, True]],
'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
[256, 512, 9, True, False], [512, 1024, 3, True, True]],
'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
[256, 512, 9, True, False], [512, 768, 3, True, False],
[768, 1024, 3, True, True]],
}
def __init__(self,
arch='X',
depth_mult=1.0,
width_mult=1.0,
depthwise=False,
act='silu',
trt=False,
return_idx=[2, 3, 4]):
super(CSPDarkNet, self).__init__()
self.arch = arch
self.return_idx = return_idx
Conv = DWConv if depthwise else BaseConv
arch_setting = self.arch_settings[arch]
base_channels = int(arch_setting[0][0] * width_mult)
# Note: differences between the latest YOLOv5 and the original YOLOX
# 1. self.stem: Conv (in YOLOv5) or Focus (in YOLOX)
# 2. SPPF (in YOLOv5) or SPP (in YOLOX)
# 3. SPPF is put before (YOLOv5), and SPP after (YOLOX), the last dark block's CSPLayer
# 4. whether the SPPF(SPP) CSPLayer adds a shortcut: True in YOLOv5, False in YOLOX
if arch in ['P5', 'P6']:
# in the latest YOLOv5, use Conv stem, and SPPF (fast, only a single spp kernel size)
self.stem = Conv(
3, base_channels, ksize=6, stride=2, bias=False, act=act)
spp_kernal_sizes = 5
elif arch in ['X']:
# in the original YOLOX, use Focus stem, and SPP (three spp kernel sizes)
self.stem = Focus(
3, base_channels, ksize=3, stride=1, bias=False, act=act)
spp_kernal_sizes = (5, 9, 13)
else:
raise AttributeError("Unsupported arch type: {}".format(arch))
_out_channels = [base_channels]
layers_num = 1
self.csp_dark_blocks = []
for i, (in_channels, out_channels, num_blocks, shortcut,
use_spp) in enumerate(arch_setting):
in_channels = int(in_channels * width_mult)
out_channels = int(out_channels * width_mult)
_out_channels.append(out_channels)
num_blocks = max(round(num_blocks * depth_mult), 1)
stage = []
conv_layer = self.add_sublayer(
'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
Conv(
in_channels, out_channels, 3, 2, bias=False, act=act))
stage.append(conv_layer)
layers_num += 1
if use_spp and arch in ['X']:
# in YOLOX use SPPLayer
spp_layer = self.add_sublayer(
'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
SPPLayer(
out_channels,
out_channels,
kernel_sizes=spp_kernal_sizes,
bias=False,
act=act))
stage.append(spp_layer)
layers_num += 1
csp_layer = self.add_sublayer(
'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
CSPLayer(
out_channels,
out_channels,
num_blocks=num_blocks,
shortcut=shortcut,
depthwise=depthwise,
bias=False,
act=act))
stage.append(csp_layer)
layers_num += 1
if use_spp and arch in ['P5', 'P6']:
# in latest YOLOv5 use SPPFLayer instead of SPPLayer
sppf_layer = self.add_sublayer(
'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
SPPFLayer(
out_channels,
out_channels,
ksize=5,
bias=False,
act=act))
stage.append(sppf_layer)
layers_num += 1
self.csp_dark_blocks.append(nn.Sequential(*stage))
self._out_channels = [_out_channels[i] for i in self.return_idx]
self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
def forward(self, inputs):
x = inputs['image']
outputs = []
x = self.stem(x)
for i, layer in enumerate(self.csp_dark_blocks):
x = layer(x)
if i + 1 in self.return_idx:
outputs.append(x)
return outputs
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self._out_channels, self.strides)
]
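The SPPF docstring above claims equivalence to `SPP(k=(5, 9, 13))`; that holds because stacking stride-1 5x5 max-pools grows the receptive field to 9 and then 13. A quick numerical check of the pooling identity:

```python
import paddle
import paddle.nn as nn

x = paddle.rand([1, 8, 32, 32])
p5 = nn.MaxPool2D(5, stride=1, padding=2)
p9 = nn.MaxPool2D(9, stride=1, padding=4)
p13 = nn.MaxPool2D(13, stride=1, padding=6)

y1 = p5(x)   # SPPF's first pool
y2 = p5(y1)  # equals a 9x9 pool of x
y3 = p5(y2)  # equals a 13x13 pool of x
assert paddle.allclose(y2, p9(x))
assert paddle.allclose(y3, p13(x))
```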

@ -0,0 +1,321 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Constant
from paddlers.models.ppdet.modeling.ops import get_act_fn
from paddlers.models.ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
class ConvBNLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
bias_attr=False)
self.bn = nn.BatchNorm2D(
ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)
return x
class RepVggBlock(nn.Layer):
def __init__(self, ch_in, ch_out, act='relu', alpha=False):
super(RepVggBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.conv1 = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=None)
self.conv2 = ConvBNLayer(
ch_in, ch_out, 1, stride=1, padding=0, act=None)
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
if alpha:
self.alpha = self.create_parameter(
shape=[1],
attr=ParamAttr(initializer=Constant(value=1.)),
dtype="float32")
else:
self.alpha = None
def forward(self, x):
if hasattr(self, 'conv'):
y = self.conv(x)
else:
if self.alpha:
y = self.conv1(x) + self.alpha * self.conv2(x)
else:
y = self.conv1(x) + self.conv2(x)
y = self.act(y)
return y
def convert_to_deploy(self):
if not hasattr(self, 'conv'):
self.conv = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=3,
stride=1,
padding=1,
groups=1)
kernel, bias = self.get_equivalent_kernel_bias()
self.conv.weight.set_value(kernel)
self.conv.bias.set_value(bias)
self.__delattr__('conv1')
self.__delattr__('conv2')
def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
if self.alpha:
return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
kernel1x1), bias3x3 + self.alpha * bias1x1
else:
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1), bias3x3 + bias1x1
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
kernel = branch.conv.weight
running_mean = branch.bn._mean
running_var = branch.bn._variance
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
return kernel * t, beta - running_mean * gamma / std
class BasicBlock(nn.Layer):
def __init__(self,
ch_in,
ch_out,
act='relu',
shortcut=True,
use_alpha=False):
super(BasicBlock, self).__init__()
assert ch_in == ch_out
self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
self.shortcut = shortcut
def forward(self, x):
y = self.conv1(x)
y = self.conv2(y)
if self.shortcut:
return paddle.add(x, y)
else:
return y
class EffectiveSELayer(nn.Layer):
""" Effective Squeeze-Excitation
From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
"""
def __init__(self, channels, act='hardsigmoid'):
super(EffectiveSELayer, self).__init__()
self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
x_se = x.mean((2, 3), keepdim=True)
x_se = self.fc(x_se)
return x * self.act(x_se)
class CSPResStage(nn.Layer):
def __init__(self,
block_fn,
ch_in,
ch_out,
n,
stride,
act='relu',
attn='eca',
use_alpha=False):
super(CSPResStage, self).__init__()
ch_mid = (ch_in + ch_out) // 2
if stride == 2:
self.conv_down = ConvBNLayer(
ch_in, ch_mid, 3, stride=2, padding=1, act=act)
else:
self.conv_down = None
self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
self.blocks = nn.Sequential(*[
block_fn(
ch_mid // 2,
ch_mid // 2,
act=act,
shortcut=True,
use_alpha=use_alpha) for i in range(n)
])
if attn:
self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')
else:
self.attn = None
self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
def forward(self, x):
if self.conv_down is not None:
x = self.conv_down(x)
y1 = self.conv1(x)
y2 = self.blocks(self.conv2(x))
y = paddle.concat([y1, y2], axis=1)
if self.attn is not None:
y = self.attn(y)
y = self.conv3(y)
return y
@register
@serializable
class CSPResNet(nn.Layer):
__shared__ = ['width_mult', 'depth_mult', 'trt']
def __init__(self,
layers=[3, 6, 6, 3],
channels=[64, 128, 256, 512, 1024],
act='swish',
return_idx=[1, 2, 3],
depth_wise=False,
use_large_stem=False,
width_mult=1.0,
depth_mult=1.0,
trt=False,
use_checkpoint=False,
use_alpha=False,
**args):
super(CSPResNet, self).__init__()
self.use_checkpoint = use_checkpoint
channels = [max(round(c * width_mult), 1) for c in channels]
layers = [max(round(l * depth_mult), 1) for l in layers]
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
if use_large_stem:
self.stem = nn.Sequential(
('conv1', ConvBNLayer(
3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
('conv2', ConvBNLayer(
channels[0] // 2,
channels[0] // 2,
3,
stride=1,
padding=1,
act=act)), ('conv3', ConvBNLayer(
channels[0] // 2,
channels[0],
3,
stride=1,
padding=1,
act=act)))
else:
self.stem = nn.Sequential(
('conv1', ConvBNLayer(
3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
('conv2', ConvBNLayer(
channels[0] // 2,
channels[0],
3,
stride=1,
padding=1,
act=act)))
n = len(channels) - 1
self.stages = nn.Sequential(*[(str(i), CSPResStage(
BasicBlock,
channels[i],
channels[i + 1],
layers[i],
2,
act=act,
use_alpha=use_alpha)) for i in range(n)])
self._out_channels = channels[1:]
self._out_strides = [4 * 2**i for i in range(n)]
self.return_idx = return_idx
if use_checkpoint:
paddle.seed(0)
def forward(self, inputs):
x = inputs['image']
x = self.stem(x)
outs = []
for idx, stage in enumerate(self.stages):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
stage, x, **{"preserve_rng_state": True})
else:
x = stage(x)
if idx in self.return_idx:
outs.append(x)
return outs
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
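# A minimal smoke test for the backbone above. The import path is an
# assumption for illustration (a sketch, not part of this diff); strides
# follow self._out_strides, so return_idx=[1, 2, 3] yields feature maps at
# strides 8, 16 and 32.
import paddle
from paddlers.models.ppdet.modeling.backbones.cspresnet import CSPResNet

model = CSPResNet(width_mult=0.50, depth_mult=0.33, return_idx=[1, 2, 3])
feats = model({'image': paddle.rand([1, 3, 640, 640])})
for feat, spec in zip(feats, model.out_shape):
    print(feat.shape, spec.channels, spec.stride)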

@ -77,8 +77,8 @@ class ConvBNLayer(nn.Layer):
out = self.batch_norm(out)
if self.act == 'leaky':
out = F.leaky_relu(out, 0.1)
elif self.act == 'mish':
out = mish(out)
else:
out = getattr(F, self.act)(out)
return out
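# The fallback branch above resolves the activation by name through
# paddle.nn.functional, so any F.<name> activation works without adding a
# new elif. A tiny illustration of that dispatch (sketch only):
import paddle
import paddle.nn.functional as F

x = paddle.to_tensor([-1.0, 0.0, 1.0])
for name in ('mish', 'relu', 'silu'):
    print(name, getattr(F, name)(x).numpy())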
@ -149,9 +149,14 @@ class BasicBlock(nn.Layer):
super(BasicBlock, self).__init__()
assert ch_in == ch_out and (ch_in % 2) == 0, \
f"ch_in and ch_out should be the same even int, but got ch_in={ch_in}, ch_out={ch_out}"
# example:
# --------------{conv1} --> {conv2}
# channel route: 10-->5 --> 5-->10
self.conv1 = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
ch_out=int(ch_out / 2),
filter_size=1,
stride=1,
padding=0,
@ -160,8 +165,8 @@ class BasicBlock(nn.Layer):
freeze_norm=freeze_norm,
data_format=data_format)
self.conv2 = ConvBNLayer(
ch_in=ch_out,
ch_out=ch_out * 2,
ch_in=int(ch_out / 2),
ch_out=ch_out,
filter_size=3,
stride=1,
padding=1,
@ -215,7 +220,7 @@ class Blocks(nn.Layer):
res_out = self.add_sublayer(
block_name,
BasicBlock(
ch_out * 2,
ch_out,
ch_out,
norm_type=norm_type,
norm_decay=norm_decay,
@ -296,7 +301,7 @@ class DarkNet(nn.Layer):
name,
Blocks(
int(ch_in[i]),
32 * (2**i),
int(ch_in[i]),
stage,
norm_type=norm_type,
norm_decay=norm_decay,
@ -305,14 +310,14 @@ class DarkNet(nn.Layer):
name=name))
self.darknet_conv_block_list.append(conv_block)
if i in return_idx:
self._out_channels.append(64 * (2**i))
self._out_channels.append(int(ch_in[i]))
for i in range(num_stages - 1):
down_name = 'stage.{}.downsample'.format(i)
downsample = self.add_sublayer(
down_name,
DownSample(
ch_in=32 * (2**(i + 1)),
ch_out=32 * (2**(i + 2)),
ch_in=int(ch_in[i]),
ch_out=int(ch_in[i + 1]),
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -20,7 +20,7 @@ import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm
from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -299,12 +299,12 @@ class GhostBottleneck(nn.Layer):
class GhostNet(nn.Layer):
__shared__ = ['norm_type']
def __init__(self,
def __init__(
self,
scale=1.3,
feature_maps=[6, 12, 15],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256],
[64, 128]],
extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
conv_decay=0.,
norm_type='bn',

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -146,7 +146,7 @@ class HarDBlock(nn.Layer):
class HarDNet(nn.Layer):
def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85):
super(HarDNet, self).__init__()
assert arch in [39, 68, 85], "HarDNet-{} not support.".format(arch)
assert arch in [68, 85], "HarDNet-{} is not supported.".format(arch)
if arch == 85:
first_ch = [48, 96]
second_kernel = 3
@ -161,6 +161,8 @@ class HarDNet(nn.Layer):
grmul = 1.7
gr = [14, 16, 20, 40]
n_layers = [8, 16, 16, 16]
else:
raise ValueError("HarDNet-{} is not supported.".format(arch))
self.return_idx = return_idx
self._out_channels = [96, 214, 458, 784]

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -68,7 +68,8 @@ class ConvBNLayer(nn.Layer):
filter_size,
num_filters,
stride,
num_groups=1):
num_groups=1,
act='hard_swish'):
super().__init__()
self.conv = Conv2D(
@ -85,12 +86,15 @@ class ConvBNLayer(nn.Layer):
num_filters,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self.hardswish = nn.Hardswish()
if act == 'hard_swish':
self.act = nn.Hardswish()
elif act == 'relu6':
self.act = nn.ReLU6()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.hardswish(x)
x = self.act(x)
return x
@ -100,7 +104,8 @@ class DepthwiseSeparable(nn.Layer):
num_filters,
stride,
dw_size=3,
use_se=False):
use_se=False,
act='hard_swish'):
super().__init__()
self.use_se = use_se
self.dw_conv = ConvBNLayer(
@ -108,14 +113,16 @@ class DepthwiseSeparable(nn.Layer):
num_filters=num_channels,
filter_size=dw_size,
stride=stride,
num_groups=num_channels)
num_groups=num_channels,
act=act)
if use_se:
self.se = SEModule(num_channels)
self.pw_conv = ConvBNLayer(
num_channels=num_channels,
filter_size=1,
num_filters=num_filters,
stride=1)
stride=1,
act=act)
def forward(self, x):
x = self.dw_conv(x)
@ -158,7 +165,7 @@ class SEModule(nn.Layer):
@register
@serializable
class LCNet(nn.Layer):
def __init__(self, scale=1.0, feature_maps=[3, 4, 5]):
def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
super().__init__()
self.scale = scale
self.feature_maps = feature_maps
@ -169,7 +176,8 @@ class LCNet(nn.Layer):
num_channels=3,
filter_size=3,
num_filters=make_divisible(16 * scale),
stride=2)
stride=2,
act=act)
self.blocks2 = nn.Sequential(*[
DepthwiseSeparable(
@ -177,7 +185,8 @@ class LCNet(nn.Layer):
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se)
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"])
])
@ -187,7 +196,8 @@ class LCNet(nn.Layer):
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se)
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"])
])
@ -200,7 +210,8 @@ class LCNet(nn.Layer):
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se)
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"])
])
@ -213,7 +224,8 @@ class LCNet(nn.Layer):
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se)
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"])
])
@ -226,7 +238,8 @@ class LCNet(nn.Layer):
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se)
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"])
])

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -282,13 +282,13 @@ class ExtraBlockDW(nn.Layer):
class MobileNetV3(nn.Layer):
__shared__ = ['norm_type']
def __init__(self,
def __init__(
self,
scale=1.0,
model_name="large",
feature_maps=[6, 12, 15],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256],
[64, 128]],
extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
conv_decay=0.0,
multiplier=1.0,

@ -0,0 +1,266 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is the Paddle implementation of the MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf.
Some code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
The copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant
from paddlers.models.ppdet.modeling.ops import get_act_fn
from paddlers.models.ppdet.modeling.layers import ConvNormLayer
class MobileOneBlock(nn.Layer):
def __init__(
self,
ch_in,
ch_out,
stride,
kernel_size,
conv_num=1,
norm_type='bn',
norm_decay=0.,
norm_groups=32,
bias_on=False,
lr_scale=1.,
freeze_norm=False,
initializer=Normal(
mean=0., std=0.01),
skip_quant=False,
act='relu', ):
super(MobileOneBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.kernel_size = kernel_size
self.stride = stride
self.padding = (kernel_size - 1) // 2
self.k = conv_num
self.depth_conv = nn.LayerList()
self.point_conv = nn.LayerList()
for _ in range(self.k):
self.depth_conv.append(
ConvNormLayer(
ch_in,
ch_in,
kernel_size,
stride=stride,
groups=ch_in,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant))
self.point_conv.append(
ConvNormLayer(
ch_in,
ch_out,
1,
stride=1,
groups=1,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant))
self.rbr_1x1 = ConvNormLayer(
ch_in,
ch_in,
1,
stride=self.stride,
groups=ch_in,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant)
self.rbr_identity_st1 = nn.BatchNorm2D(
num_features=ch_in,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(
0.0))) if ch_in == ch_out and self.stride == 1 else None
self.rbr_identity_st2 = nn.BatchNorm2D(
num_features=ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(
0.0))) if ch_in == ch_out and self.stride == 1 else None
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
if hasattr(self, "conv1") and hasattr(self, "conv2"):
y = self.act(self.conv2(self.act(self.conv1(x))))
else:
if self.rbr_identity_st1 is None:
id_out_st1 = 0
else:
id_out_st1 = self.rbr_identity_st1(x)
x1_1 = 0
for i in range(self.k):
x1_1 += self.depth_conv[i](x)
x1_2 = self.rbr_1x1(x)
x1 = self.act(x1_1 + x1_2 + id_out_st1)
if self.rbr_identity_st2 is None:
id_out_st2 = 0
else:
id_out_st2 = self.rbr_identity_st2(x1)
x2_1 = 0
for i in range(self.k):
x2_1 += self.point_conv[i](x1)
y = self.act(x2_1 + id_out_st2)
return y
def convert_to_deploy(self):
if not hasattr(self, 'conv1'):
self.conv1 = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_in,
kernel_size=self.kernel_size,
stride=self.stride,
padding=self.padding,
groups=self.ch_in,
bias_attr=ParamAttr(
initializer=Constant(value=0.), learning_rate=1.))
if not hasattr(self, 'conv2'):
self.conv2 = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=1,
stride=1,
padding='SAME',
groups=1,
bias_attr=ParamAttr(
initializer=Constant(value=0.), learning_rate=1.))
conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(
)
self.conv1.weight.set_value(conv1_kernel)
self.conv1.bias.set_value(conv1_bias)
self.conv2.weight.set_value(conv2_kernel)
self.conv2.bias.set_value(conv2_bias)
self.__delattr__('depth_conv')
self.__delattr__('point_conv')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity_st1'):
self.__delattr__('rbr_identity_st1')
if hasattr(self, 'rbr_identity_st2'):
self.__delattr__('rbr_identity_st2')
def get_equivalent_kernel_bias(self):
st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
st1_kernelid, st1_biasid = self._fuse_bn_tensor(
self.rbr_identity_st1, kernel_size=self.kernel_size)
st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
st2_kernelid, st2_biasid = self._fuse_bn_tensor(
self.rbr_identity_st2, kernel_size=1)
conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
st1_kernel1x1) + st1_kernelid
conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid
conv2_kernel = st2_kernel1x1 + st2_kernelid
conv2_bias = st2_bias1x1 + st2_biasid
return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
padding_size = (self.kernel_size - 1) // 2
return nn.functional.pad(
kernel1x1,
[padding_size, padding_size, padding_size, padding_size])
def _fuse_bn_tensor(self, branch, kernel_size=3):
if branch is None:
return 0, 0
if isinstance(branch, nn.LayerList):
fused_kernels = []
fused_bias = []
for block in branch:
kernel = block.conv.weight
running_mean = block.norm._mean
running_var = block.norm._variance
gamma = block.norm.weight
beta = block.norm.bias
eps = block.norm._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
fused_kernels.append(kernel * t)
fused_bias.append(beta - running_mean * gamma / std)
return sum(fused_kernels), sum(fused_bias)
elif isinstance(branch, ConvNormLayer):
kernel = branch.conv.weight
running_mean = branch.norm._mean
running_var = branch.norm._variance
gamma = branch.norm.weight
beta = branch.norm.bias
eps = branch.norm._epsilon
else:
assert isinstance(branch, nn.BatchNorm2D)
input_dim = self.ch_in if kernel_size == 1 else 1
kernel_value = paddle.zeros(
shape=[self.ch_in, input_dim, kernel_size, kernel_size],
dtype='float32')
if kernel_size > 1:
for i in range(self.ch_in):
kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
kernel_size - 1) // 2] = 1
elif kernel_size == 1:
for i in range(self.ch_in):
kernel_value[i, i % input_dim, 0, 0] = 1
else:
raise ValueError("Invalid kernel size recieved!")
kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
running_mean = branch._mean
running_var = branch._variance
gamma = branch.weight
beta = branch.bias
eps = branch._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
return kernel * t, beta - running_mean * gamma / std
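# Sanity sketch for the reparameterization above: in eval mode (frozen BN
# statistics) the fused deploy block should reproduce the multi-branch
# training-time block. The import path is an assumption for illustration.
import paddle
from paddlers.models.ppdet.modeling.backbones.mobileone import MobileOneBlock

paddle.seed(0)
block = MobileOneBlock(ch_in=8, ch_out=8, stride=1, kernel_size=3, conv_num=2)
block.eval()
x = paddle.rand([1, 8, 16, 16])
y_train = block(x)
block.convert_to_deploy()
y_deploy = block(x)
print(float((y_train - y_deploy).abs().max()))  # expected to be ~1e-6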

@ -16,6 +16,8 @@ import paddle.nn as nn
from paddlers.models.ppdet.core.workspace import register, serializable
from .resnet import ResNet, Blocks, BasicBlock, BottleNeck
from ..shape_spec import ShapeSpec
from .name_adapter import NameAdapter
__all__ = ['SENet', 'SERes5Head']

@ -1,4 +1,4 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -188,11 +188,10 @@ class ShuffleNetV2(nn.Layer):
elif scale == 1.5:
stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif scale == 2.0:
stage_out_channels = [-1, 24, 224, 488, 976, 2048]
stage_out_channels = [-1, 24, 244, 488, 976, 2048]
else:
raise NotImplementedError("This scale size:[" + str(scale) +
"] is not implemented!")
self._out_channels = []
self._feature_idx = 0
# 1. conv1

@ -20,62 +20,13 @@ MIT License [see LICENSE for details]
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import TruncatedNormal, Constant, Assign
from paddlers.models.ppdet.modeling.shape_spec import ShapeSpec
from paddlers.models.ppdet.core.workspace import register, serializable
import numpy as np
# Common initializations
ones_ = Constant(value=1.)
zeros_ = Constant(value=0.)
trunc_normal_ = TruncatedNormal(std=.02)
# Common Functions
def to_2tuple(x):
return tuple([x] * 2)
def add_parameter(layer, datas, name=None):
parameter = layer.create_parameter(
shape=(datas.shape), default_initializer=Assign(datas))
if name:
layer.add_parameter(name, parameter)
return parameter
# Common Layers
def drop_path(x, drop_prob=0., training=False):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1 - drop_prob)
shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
class DropPath(nn.Layer):
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
from .transformer_utils import DropPath, Identity
from .transformer_utils import add_parameter, to_2tuple
from .transformer_utils import ones_, zeros_, trunc_normal_
class Mlp(nn.Layer):
@ -112,7 +63,7 @@ def window_partition(x, window_size):
"""
B, H, W, C = x.shape
x = x.reshape(
[B, H // window_size, window_size, W // window_size, window_size, C])
[-1, H // window_size, window_size, W // window_size, window_size, C])
windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
[-1, window_size, window_size, C])
return windows
@ -128,10 +79,11 @@ def window_reverse(windows, window_size, H, W):
Returns:
x: (B, H, W, C)
"""
_, _, _, C = windows.shape
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.reshape(
[B, H // window_size, W // window_size, window_size, window_size, -1])
x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1])
[-1, H // window_size, W // window_size, window_size, window_size, C])
x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
return x
@ -206,14 +158,14 @@ class WindowAttention(nn.Layer):
"""
B_, N, C = x.shape
qkv = self.qkv(x).reshape(
[B_, N, 3, self.num_heads, C // self.num_heads]).transpose(
[-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
[2, 0, 3, 1, 4])
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
index = self.relative_position_index.reshape([-1])
index = self.relative_position_index.flatten()
relative_position_bias = paddle.index_select(
self.relative_position_bias_table, index)
@ -227,7 +179,7 @@ class WindowAttention(nn.Layer):
if mask is not None:
nW = mask.shape[0]
attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N
attn = attn.reshape([-1, nW, self.num_heads, N, N
]) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.reshape([-1, self.num_heads, N, N])
attn = self.softmax(attn)
@ -237,7 +189,7 @@ class WindowAttention(nn.Layer):
attn = self.attn_drop(attn)
# x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C])
x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
x = self.proj(x)
x = self.proj_drop(x)
return x
@ -315,7 +267,7 @@ class SwinTransformerBlock(nn.Layer):
shortcut = x
x = self.norm1(x)
x = x.reshape([B, H, W, C])
x = x.reshape([-1, H, W, C])
# pad feature maps to multiples of window size
pad_l = pad_t = 0
@ -337,7 +289,7 @@ class SwinTransformerBlock(nn.Layer):
x_windows = window_partition(
shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.reshape(
[-1, self.window_size * self.window_size,
[x_windows.shape[0], self.window_size * self.window_size,
C]) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
@ -346,7 +298,7 @@ class SwinTransformerBlock(nn.Layer):
# merge windows
attn_windows = attn_windows.reshape(
[-1, self.window_size, self.window_size, C])
[x_windows.shape[0], self.window_size, self.window_size, C])
shifted_x = window_reverse(attn_windows, self.window_size, Hp,
Wp) # B H' W' C
@ -362,7 +314,7 @@ class SwinTransformerBlock(nn.Layer):
if pad_r > 0 or pad_b > 0:
x = x[:, :H, :W, :]
x = x.reshape([B, H * W, C])
x = x.reshape([-1, H * W, C])
# FFN
x = shortcut + self.drop_path(x)
@ -393,7 +345,7 @@ class PatchMerging(nn.Layer):
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.reshape([B, H, W, C])
x = x.reshape([-1, H, W, C])
# padding
pad_input = (H % 2 == 1) or (W % 2 == 1)
@ -405,7 +357,7 @@ class PatchMerging(nn.Layer):
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.reshape([B, H * W // 4, 4 * C]) # B H/2*W/2 4*C
x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
@ -482,8 +434,7 @@ class BasicLayer(nn.Layer):
# calculate attention mask for SW-MSA
Hp = int(np.ceil(H / self.window_size)) * self.window_size
Wp = int(np.ceil(W / self.window_size)) * self.window_size
img_mask = paddle.fluid.layers.zeros(
[1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1
img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
@ -688,10 +639,10 @@ class SwinTransformer(nn.Layer):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False
param.stop_gradient = True
if self.frozen_stages >= 1 and self.ape:
self.absolute_pos_embed.requires_grad = False
self.absolute_pos_embed.stop_gradient = True
if self.frozen_stages >= 2:
self.pos_drop.eval()
@ -699,7 +650,7 @@ class SwinTransformer(nn.Layer):
m = self.layers[i]
m.eval()
for param in m.parameters():
param.requires_grad = False
param.stop_gradient = True
def _init_weights(self, m):
if isinstance(m, nn.Linear):
@ -713,7 +664,7 @@ class SwinTransformer(nn.Layer):
def forward(self, x):
"""Forward function."""
x = self.patch_embed(x['image'])
_, _, Wh, Ww = x.shape
B, _, Wh, Ww = x.shape
if self.ape:
# interpolate the position embedding to the corresponding size
absolute_pos_embed = F.interpolate(

@ -0,0 +1,74 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Assign
# Common initializations
ones_ = Constant(value=1.)
zeros_ = Constant(value=0.)
trunc_normal_ = TruncatedNormal(std=.02)
# Common Layers
def drop_path(x, drop_prob=0., training=False):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1 - drop_prob)
shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
class DropPath(nn.Layer):
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
# common funcs
def to_2tuple(x):
if isinstance(x, (list, tuple)):
return x
return tuple([x] * 2)
def add_parameter(layer, datas, name=None):
parameter = layer.create_parameter(
shape=(datas.shape), default_initializer=Assign(datas))
if name:
layer.add_parameter(name, parameter)
return parameter
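# Quick behavioral sketch of the drop_path function defined above: each
# sample's residual path is kept with probability 1 - drop_prob and rescaled
# by 1 / keep_prob, so the expected output equals the input. Illustration only.
import paddle

paddle.seed(0)
x = paddle.ones([8, 4])
y = drop_path(x, drop_prob=0.5, training=True)
# Each row is either all zeros (dropped) or all 2.0 (kept, rescaled by
# 1 / 0.5), keeping the expectation at 1.0.
print(y.numpy())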

@ -168,9 +168,9 @@ class VGG(nn.Layer):
self.norms = []
for i, n in enumerate(self.normalizations):
if n != -1:
norm = self.add_sublayer(
"norm{}".format(i),
L2NormScale(self.extra_block_filters[i][1], n))
norm = self.add_sublayer("norm{}".format(i),
L2NormScale(
self.extra_block_filters[i][1], n))
else:
norm = None
self.norms.append(norm)

@ -0,0 +1,634 @@
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from paddle.nn.initializer import Constant
from paddlers.models.ppdet.modeling.shape_spec import ShapeSpec
from paddlers.models.ppdet.core.workspace import register, serializable
from .transformer_utils import zeros_, DropPath, Identity
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
window_size=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)
if qkv_bias:
self.q_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
self.v_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
else:
self.q_bias = None
self.v_bias = None
if window_size:
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1) + 3
self.relative_position_bias_table = self.create_parameter(
shape=(self.num_relative_distance, num_heads),
default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww
coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
)
#relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.transpose(
(1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[
0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index",
relative_position_index)
# trunc_normal_(self.relative_position_bias_table, std=.0)
else:
self.window_size = None
self.relative_position_bias_table = None
self.relative_position_index = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, rel_pos_bias=None):
x_shape = paddle.shape(x)
N, C = x_shape[1], x_shape[2]
qkv_bias = None
if self.q_bias is not None:
qkv_bias = paddle.concat(
(self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
qkv = qkv.reshape((-1, N, 3, self.num_heads,
C // self.num_heads)).transpose((2, 0, 3, 1, 4))
q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
if self.relative_position_bias_table is not None:
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.reshape([-1])].reshape([
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1
]) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.transpose(
(2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if rel_pos_bias is not None:
attn = attn + rel_pos_bias
attn = nn.functional.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
window_size=None,
init_values=None,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-5):
super().__init__()
self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
window_size=window_size)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
if init_values is not None:
self.gamma_1 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
self.gamma_2 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
else:
self.gamma_1, self.gamma_2 = None, None
def forward(self, x, rel_pos_bias=None):
if self.gamma_1 is None:
x = x + self.drop_path(
self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_1 * self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self,
img_size=[224, 224],
patch_size=16,
in_chans=3,
embed_dim=768):
super().__init__()
self.num_patches_w = img_size[0] // patch_size
self.num_patches_h = img_size[1] // patch_size
num_patches = self.num_patches_w * self.num_patches_h
self.patch_shape = (img_size[0] // patch_size,
img_size[1] // patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
@property
def num_patches_in_h(self):
return self.img_size[1] // self.patch_size
@property
def num_patches_in_w(self):
return self.img_size[0] // self.patch_size
def forward(self, x, mask=None):
B, C, H, W = x.shape
return self.proj(x)
class RelativePositionBias(nn.Layer):
def __init__(self, window_size, num_heads):
super().__init__()
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1) + 3
self.relative_position_bias_table = self.create_parameter(
shape=(self.num_relative_distance, num_heads),
default_initializer=zeros_)
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = coords.flatten(1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.transpose(
(1, 2, 0))  # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
paddle.zeros(shape=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index", relative_position_index)
def forward(self):
relative_position_bias = \
self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH
return relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww
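# Construction sketch for the relative position index above, for a 2x2
# window in plain numpy (illustration only): a 4x4 index with values in
# [0, (2*Wh-1)*(2*Ww-1) - 1] = [0, 8]; the +3 table slots are reserved for
# cls-to-token, token-to-cls and cls-to-cls.
import numpy as np

Wh = Ww = 2
coords = np.stack(np.meshgrid(np.arange(Wh), np.arange(Ww), indexing='ij'))
flat = coords.reshape(2, -1)                  # 2, Wh*Ww
rel = flat[:, :, None] - flat[:, None, :]     # 2, Wh*Ww, Wh*Ww
rel = rel.transpose(1, 2, 0)
rel[:, :, 0] += Wh - 1                        # shift to start from 0
rel[:, :, 1] += Ww - 1
rel[:, :, 0] *= 2 * Ww - 1
print(rel.sum(-1))                            # Wh*Ww x Wh*Ww index matrix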
def get_sinusoid_encoding_table(n_position, d_hid, token=False):
''' Sinusoid position encoding table '''
def get_position_angle_vec(position):
return [
position / np.power(10000, 2 * (hid_j // 2) / d_hid)
for hid_j in range(d_hid)
]
sinusoid_table = np.array(
[get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
if token:
sinusoid_table = np.concatenate(
[sinusoid_table, np.zeros([1, d_hid])], axis=0)
return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
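# Shape/identity sketch for the table above (uses the function as defined
# in this file): paired channels share one angle vector, so
# sin^2 + cos^2 == 1 for every (position, frequency) pair.
import numpy as np

table = get_sinusoid_encoding_table(n_position=16, d_hid=8).numpy()[0]
assert table.shape == (16, 8)
assert np.allclose(table[:, 0::2]**2 + table[:, 1::2]**2, 1.0)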
@register
@serializable
class VisionTransformer(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=[672, 1092],
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer='nn.LayerNorm',
init_values=None,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
epsilon=1e-5,
final_norm=False,
pretrained=None,
out_indices=[3, 5, 7, 11],
use_abs_pos_emb=False,
use_sincos_pos_emb=True,
with_fpn=True,
use_checkpoint=False,
**args):
super().__init__()
self.img_size = img_size
self.embed_dim = embed_dim
self.with_fpn = with_fpn
self.use_checkpoint = use_checkpoint
self.use_sincos_pos_emb = use_sincos_pos_emb
self.use_rel_pos_bias = use_rel_pos_bias
self.final_norm = final_norm
if use_checkpoint:
paddle.seed(0)
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
self.pos_w = self.patch_embed.num_patches_in_w
self.pos_h = self.patch_embed.num_patches_in_h
self.cls_token = self.create_parameter(
shape=(1, 1, embed_dim),
default_initializer=paddle.nn.initializer.Constant(value=0.))
if use_abs_pos_emb:
self.pos_embed = self.create_parameter(
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
default_initializer=paddle.nn.initializer.TruncatedNormal(
std=.02))
elif use_sincos_pos_emb:
pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
self.pos_embed = pos_embed
self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy())
self.pos_embed.stop_gradient = True
else:
self.pos_embed = None
self.pos_drop = nn.Dropout(p=drop_rate)
if use_shared_rel_pos_bias:
self.rel_pos_bias = RelativePositionBias(
window_size=self.patch_embed.patch_shape, num_heads=num_heads)
else:
self.rel_pos_bias = None
dpr = np.linspace(0, drop_path_rate, depth)
self.blocks = nn.LayerList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
init_values=init_values,
window_size=self.patch_embed.patch_shape
if use_rel_pos_bias else None,
epsilon=epsilon) for i in range(depth)
])
self.pretrained = pretrained
self.init_weight()
assert len(out_indices) <= 4, ''
self.out_indices = out_indices
self.out_channels = [embed_dim for _ in range(len(out_indices))]
self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
8 for _ in range(len(out_indices))
]
self.norm = Identity()
if self.with_fpn:
self.init_fpn(
embed_dim=embed_dim,
patch_size=patch_size, )
def init_weight(self):
pretrained = self.pretrained
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
load_state_dict = paddle.load(path)
model_state_dict = self.state_dict()
pos_embed_name = "pos_embed"
if pos_embed_name in load_state_dict.keys():
load_pos_embed = paddle.to_tensor(
load_state_dict[pos_embed_name], dtype="float32")
if self.pos_embed.shape != load_pos_embed.shape:
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
model_state_dict[pos_embed_name] = self.resize_pos_embed(
load_pos_embed, (pos_size, pos_size),
(self.pos_h, self.pos_w))
# self.set_state_dict(model_state_dict)
load_state_dict[pos_embed_name] = model_state_dict[
pos_embed_name]
print("Load pos_embed and resize it from {} to {} .".format(
load_pos_embed.shape, self.pos_embed.shape))
self.set_state_dict(load_state_dict)
print("Load load_state_dict....")
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
if patch_size == 16:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2),
nn.BatchNorm2D(embed_dim),
nn.GELU(),
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn3 = Identity()
self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
elif patch_size == 8:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = Identity()
self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
if not out_with_norm:
self.norm = Identity()
else:
self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
def interpolate_pos_encoding(self, x, w, h):
npatch = x.shape[1] - 1
N = self.pos_embed.shape[1] - 1
w0 = w // self.patch_embed.patch_size
h0 = h // self.patch_embed.patch_size
if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
return self.pos_embed
class_pos_embed = self.pos_embed[:, 0]
patch_pos_embed = self.pos_embed[:, 1:]
dim = x.shape[-1]
# we add a small number to avoid floating point error in the interpolation
# see discussion at https://github.com/facebookresearch/dino/issues/8
w0, h0 = w0 + 0.1, h0 + 0.1
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed.reshape([
1, self.patch_embed.num_patches_w,
self.patch_embed.num_patches_h, dim
]).transpose((0, 3, 1, 2)),
scale_factor=(w0 / self.patch_embed.num_patches_w,
h0 / self.patch_embed.num_patches_h),
mode='bicubic', )
assert int(w0) == patch_pos_embed.shape[-2] and int(
h0) == patch_pos_embed.shape[-1]
patch_pos_embed = patch_pos_embed.transpose(
(0, 2, 3, 1)).reshape([1, -1, dim])
return paddle.concat(
(class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
"""
Resize pos_embed weight.
Args:
pos_embed (Tensor): the pos_embed weight
old_hw (list[int]): the height and width of old pos_embed
new_hw (list[int]): the height and width of new pos_embed
Returns:
Tensor: the resized pos_embed weight
"""
cls_pos_embed = pos_embed[:, :1, :]
pos_embed = pos_embed[:, 1:, :]
pos_embed = pos_embed.transpose([0, 2, 1])
pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
pos_embed = F.interpolate(
pos_embed, new_hw, mode='bicubic', align_corners=False)
pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
return pos_embed
def build_2d_sincos_position_embedding(
self,
embed_dim=768,
temperature=10000., ):
h, w = self.patch_embed.patch_shape
grid_w = paddle.arange(w, dtype=paddle.float32)
grid_h = paddle.arange(h, dtype=paddle.float32)
grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
pos_dim = embed_dim // 4
omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
omega = 1. / (temperature**omega)
out_w = grid_w.flatten()[..., None] @ omega[None]
out_h = grid_h.flatten()[..., None] @ omega[None]
pos_emb = paddle.concat(
[
paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
paddle.cos(out_h)
],
axis=1)[None, :, :]
pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
pos_embed = paddle.concat([pe_token, pos_emb], axis=1)
# pos_embed.stop_gradient = True
return pos_embed
def forward(self, x):
x = x['image'] if isinstance(x, dict) else x
_, _, h, w = x.shape
x = self.patch_embed(x)
B, D, Hp, Wp = x.shape # b * c * h * w
cls_tokens = self.cls_token.expand(
(B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c
x = paddle.concat([cls_tokens, x], axis=1)
if self.pos_embed is not None:
# x = x + self.interpolate_pos_encoding(x, w, h)
x = x + self.interpolate_pos_encoding(x, h, w)
x = self.pos_drop(x)
rel_pos_bias = self.rel_pos_bias(
) if self.rel_pos_bias is not None else None
feats = []
for idx, blk in enumerate(self.blocks):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
blk, x, rel_pos_bias, **{"preserve_rng_state": True})
else:
x = blk(x, rel_pos_bias)
if idx in self.out_indices:
xp = paddle.reshape(
paddle.transpose(
self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
shape=[B, D, Hp, Wp])
feats.append(xp)
if self.with_fpn:
fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
for i in range(len(feats)):
feats[i] = fpns[i](feats[i])
return feats
@property
def num_layers(self):
return len(self.blocks)
@property
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self.out_channels, self.out_strides)
]

@ -278,8 +278,8 @@ def decode_yolo(box, anchor, downsample_ratio):
return [x1, y1, w1, h1]
def iou_similarity(box1, box2, eps=1e-9):
"""Calculate iou of box1 and box2
def batch_iou_similarity(box1, box2, eps=1e-9):
"""Calculate iou of box1 and box2 in batch
Args:
box1 (Tensor): box with the shape [N, M1, 4]
@ -359,295 +359,6 @@ def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
return iou
def rect2rbox(bboxes):
"""
:param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax)
:return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle)
"""
bboxes = bboxes.reshape(-1, 4)
num_boxes = bboxes.shape[0]
x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0
y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0
edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0])
edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1])
angles = np.zeros([num_boxes], dtype=bboxes.dtype)
inds = edges1 < edges2
rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1)
rboxes[inds, 2] = edges2[inds]
rboxes[inds, 3] = edges1[inds]
rboxes[inds, 4] = np.pi / 2.0
return rboxes
def delta2rbox(rrois,
deltas,
means=[0, 0, 0, 0, 0],
stds=[1, 1, 1, 1, 1],
wh_ratio_clip=1e-6):
"""
:param rrois: (cx, cy, w, h, theta)
:param deltas: (dx, dy, dw, dh, dtheta)
:param means:
:param stds:
:param wh_ratio_clip:
:return:
"""
means = paddle.to_tensor(means)
stds = paddle.to_tensor(stds)
deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]])
denorm_deltas = deltas * stds + means
dx = denorm_deltas[:, 0]
dy = denorm_deltas[:, 1]
dw = denorm_deltas[:, 2]
dh = denorm_deltas[:, 3]
dangle = denorm_deltas[:, 4]
max_ratio = np.abs(np.log(wh_ratio_clip))
dw = paddle.clip(dw, min=-max_ratio, max=max_ratio)
dh = paddle.clip(dh, min=-max_ratio, max=max_ratio)
rroi_x = rrois[:, 0]
rroi_y = rrois[:, 1]
rroi_w = rrois[:, 2]
rroi_h = rrois[:, 3]
rroi_angle = rrois[:, 4]
gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin(
rroi_angle) + rroi_x
gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos(
rroi_angle) + rroi_y
gw = rroi_w * dw.exp()
gh = rroi_h * dh.exp()
ga = np.pi * dangle + rroi_angle
ga = (ga + np.pi / 4) % np.pi - np.pi / 4
ga = paddle.to_tensor(ga)
gw = paddle.to_tensor(gw, dtype='float32')
gh = paddle.to_tensor(gh, dtype='float32')
bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1)
return bboxes
def rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]):
"""
Args:
proposals:
gt:
means: 1x5
stds: 1x5
Returns:
"""
proposals = proposals.astype(np.float64)
PI = np.pi
gt_widths = gt[..., 2]
gt_heights = gt[..., 3]
gt_angle = gt[..., 4]
proposals_widths = proposals[..., 2]
proposals_heights = proposals[..., 3]
proposals_angle = proposals[..., 4]
coord = gt[..., 0:2] - proposals[..., 0:2]
dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4])
* coord[..., 1]) / proposals_widths
dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4])
* coord[..., 1]) / proposals_heights
dw = np.log(gt_widths / proposals_widths)
dh = np.log(gt_heights / proposals_heights)
da = (gt_angle - proposals_angle)
da = (da + PI / 4) % PI - PI / 4
da /= PI
deltas = np.stack([dx, dy, dw, dh, da], axis=-1)
means = np.array(means, dtype=deltas.dtype)
stds = np.array(stds, dtype=deltas.dtype)
deltas = (deltas - means) / stds
deltas = deltas.astype(np.float32)
return deltas
def bbox_decode(bbox_preds,
anchors,
means=[0, 0, 0, 0, 0],
stds=[1, 1, 1, 1, 1]):
"""decode bbox from deltas
Args:
bbox_preds: [N,H,W,5]
anchors: [H*W,5]
return:
bboxes: [N,H,W,5]
"""
means = paddle.to_tensor(means)
stds = paddle.to_tensor(stds)
num_imgs, H, W, _ = bbox_preds.shape
bboxes_list = []
for img_id in range(num_imgs):
bbox_pred = bbox_preds[img_id]
# bbox_pred.shape=[5,H,W]
bbox_delta = bbox_pred
anchors = paddle.to_tensor(anchors)
bboxes = delta2rbox(
anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6)
bboxes = paddle.reshape(bboxes, [H, W, 5])
bboxes_list.append(bboxes)
return paddle.stack(bboxes_list, axis=0)
def poly2rbox(polys):
"""
poly:[x0,y0,x1,y1,x2,y2,x3,y3]
to
rotated_boxes:[x_ctr,y_ctr,w,h,angle]
"""
rotated_boxes = []
for poly in polys:
poly = np.array(poly[:8], dtype=np.float32)
pt1 = (poly[0], poly[1])
pt2 = (poly[2], poly[3])
pt3 = (poly[4], poly[5])
pt4 = (poly[6], poly[7])
edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[
1]) * (pt1[1] - pt2[1]))
edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[
1]) * (pt2[1] - pt3[1]))
width = max(edge1, edge2)
height = min(edge1, edge2)
rbox_angle = 0
if edge1 > edge2:
rbox_angle = np.arctan2(
float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0]))
elif edge2 >= edge1:
rbox_angle = np.arctan2(
float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0]))
def norm_angle(angle, range=[-np.pi / 4, np.pi]):
return (angle - range[0]) % range[1] + range[0]
rbox_angle = norm_angle(rbox_angle)
x_ctr = float(pt1[0] + pt3[0]) / 2
y_ctr = float(pt1[1] + pt3[1]) / 2
rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle])
rotated_boxes.append(rotated_box)
ret_rotated_boxes = np.array(rotated_boxes)
assert ret_rotated_boxes.shape[1] == 5
return ret_rotated_boxes
def cal_line_length(point1, point2):
import math
return math.sqrt(
math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2))
def get_best_begin_point_single(coordinate):
x1, y1, x2, y2, x3, y3, x4, y4 = coordinate
xmin = min(x1, x2, x3, x4)
ymin = min(y1, y2, y3, y4)
xmax = max(x1, x2, x3, x4)
ymax = max(y1, y2, y3, y4)
combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]],
[[x4, y4], [x1, y1], [x2, y2], [x3, y3]],
[[x3, y3], [x4, y4], [x1, y1], [x2, y2]],
[[x2, y2], [x3, y3], [x4, y4], [x1, y1]]]
dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]
force = 100000000.0
force_flag = 0
for i in range(4):
temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \
+ cal_line_length(combinate[i][1], dst_coordinate[1]) \
+ cal_line_length(combinate[i][2], dst_coordinate[2]) \
+ cal_line_length(combinate[i][3], dst_coordinate[3])
if temp_force < force:
force = temp_force
force_flag = i
if force_flag != 0:
pass
return np.array(combinate[force_flag]).reshape(8)
def rbox2poly_np(rrects):
"""
rrect:[x_ctr,y_ctr,w,h,angle]
to
poly:[x0,y0,x1,y1,x2,y2,x3,y3]
"""
polys = []
for i in range(rrects.shape[0]):
rrect = rrects[i]
# x_ctr, y_ctr, width, height, angle = rrect[:5]
x_ctr = rrect[0]
y_ctr = rrect[1]
width = rrect[2]
height = rrect[3]
angle = rrect[4]
tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
R = np.array([[np.cos(angle), -np.sin(angle)],
[np.sin(angle), np.cos(angle)]])
poly = R.dot(rect)
x0, x1, x2, x3 = poly[0, :4] + x_ctr
y0, y1, y2, y3 = poly[1, :4] + y_ctr
poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)
poly = get_best_begin_point_single(poly)
polys.append(poly)
polys = np.array(polys)
return polys
def rbox2poly(rrects):
"""
rrect:[x_ctr,y_ctr,w,h,angle]
to
poly:[x0,y0,x1,y1,x2,y2,x3,y3]
"""
N = paddle.shape(rrects)[0]
x_ctr = rrects[:, 0]
y_ctr = rrects[:, 1]
width = rrects[:, 2]
height = rrects[:, 3]
angle = rrects[:, 4]
tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5
normal_rects = paddle.stack(
[tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0)
normal_rects = paddle.reshape(normal_rects, [2, 4, N])
normal_rects = paddle.transpose(normal_rects, [2, 0, 1])
sin, cos = paddle.sin(angle), paddle.cos(angle)
# M.shape=[N,2,2]
M = paddle.stack([cos, -sin, sin, cos], axis=0)
M = paddle.reshape(M, [2, 2, N])
M = paddle.transpose(M, [2, 0, 1])
# polys:[N,8]
polys = paddle.matmul(M, normal_rects)
polys = paddle.transpose(polys, [2, 1, 0])
polys = paddle.reshape(polys, [-1, N])
polys = paddle.transpose(polys, [1, 0])
tmp = paddle.stack(
[x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1)
polys = polys + tmp
return polys
def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
"""
Calculate the iou of box1 and box2 with numpy.
@ -744,9 +455,9 @@ def distance2bbox(points, distance, max_shape=None):
def bbox_center(boxes):
"""Get bbox centers from boxes.
Args:
boxes (Tensor): boxes with shape (N, 4), "xmin, ymin, xmax, ymax" format.
boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
Returns:
Tensor: boxes centers with shape (N, 2), "cx, cy" format.
Tensor: boxes centers with shape (..., 2), "cx, cy" format.
"""
boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2
boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2
@ -756,20 +467,136 @@ def bbox_center(boxes):
def batch_distance2bbox(points, distance, max_shapes=None):
"""Decode distance prediction to bounding box for batch.
Args:
points (Tensor): [B, ..., 2]
distance (Tensor): [B, ..., 4]
max_shapes (tuple): [B, 2], "h,w" format, Shape of the image.
points (Tensor): [B, ..., 2], "xy" format
distance (Tensor): [B, ..., 4], "ltrb" format
max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.
Returns:
Tensor: Decoded bboxes.
Tensor: Decoded bboxes, "x1y1x2y2" format.
"""
x1 = points[..., 0] - distance[..., 0]
y1 = points[..., 1] - distance[..., 1]
x2 = points[..., 0] + distance[..., 2]
y2 = points[..., 1] + distance[..., 3]
lt, rb = paddle.split(distance, 2, -1)
# when adding a tensor and a parameter, the parameter is best placed second
x1y1 = -lt + points
x2y2 = rb + points
out_bbox = paddle.concat([x1y1, x2y2], -1)
if max_shapes is not None:
for i, max_shape in enumerate(max_shapes):
x1[i] = x1[i].clip(min=0, max=max_shape[1])
y1[i] = y1[i].clip(min=0, max=max_shape[0])
x2[i] = x2[i].clip(min=0, max=max_shape[1])
y2[i] = y2[i].clip(min=0, max=max_shape[0])
return paddle.stack([x1, y1, x2, y2], -1)
max_shapes = max_shapes.flip(-1).tile([1, 2])
delta_dim = out_bbox.ndim - max_shapes.ndim
for _ in range(delta_dim):
max_shapes.unsqueeze_(1)
out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)
out_bbox = paddle.where(out_bbox > 0, out_bbox,
paddle.zeros_like(out_bbox))
return out_bbox
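# Worked example for the decoder above (sketch): a point at (10, 10) with
# ltrb distances (2, 3, 4, 5) decodes to (8, 7, 14, 15); clipping against
# an image of height 12 and width 13 caps x2 at 13 and y2 at 12.
import paddle

points = paddle.to_tensor([[[10., 10.]]])          # [B=1, 1, 2]
distance = paddle.to_tensor([[[2., 3., 4., 5.]]])  # [B=1, 1, 4]
max_shapes = paddle.to_tensor([[12., 13.]])        # [B, 2], "h, w"
print(batch_distance2bbox(points, distance, max_shapes).numpy())
# [[[8., 7., 13., 12.]]]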
def delta2bbox_v2(rois,
deltas,
means=(0.0, 0.0, 0.0, 0.0),
stds=(1.0, 1.0, 1.0, 1.0),
max_shape=None,
wh_ratio_clip=16.0 / 1000.0,
ctr_clip=None):
"""Transform network output(delta) to bboxes.
Based on https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/
bbox/coder/delta_xywh_bbox_coder.py
Args:
rois (Tensor): shape [..., 4], base bboxes, typical examples include
anchor and rois
deltas (Tensor): shape [..., 4], offset relative to base bboxes
means (list[float]): the mean that was used to normalize deltas,
must be of size 4
stds (list[float]): the std that was used to normalize deltas,
must be of size 4
max_shape (list[float] or None): height and width of image, will be
used to clip bboxes if not None
wh_ratio_clip (float): to clip delta wh of decoded bboxes
ctr_clip (float or None): whether to clip delta xy of decoded bboxes
"""
if rois.size == 0:
return paddle.empty_like(rois)
means = paddle.to_tensor(means)
stds = paddle.to_tensor(stds)
deltas = deltas * stds + means
dxy = deltas[..., :2]
dwh = deltas[..., 2:]
pxy = (rois[..., :2] + rois[..., 2:]) * 0.5
pwh = rois[..., 2:] - rois[..., :2]
dxy_wh = pwh * dxy
max_ratio = np.abs(np.log(wh_ratio_clip))
if ctr_clip is not None:
dxy_wh = paddle.clip(dxy_wh, max=ctr_clip, min=-ctr_clip)
dwh = paddle.clip(dwh, max=max_ratio)
else:
dwh = dwh.clip(min=-max_ratio, max=max_ratio)
gxy = pxy + dxy_wh
gwh = pwh * dwh.exp()
x1y1 = gxy - (gwh * 0.5)
x2y2 = gxy + (gwh * 0.5)
bboxes = paddle.concat([x1y1, x2y2], axis=-1)
if max_shape is not None:
bboxes[..., 0::2] = bboxes[..., 0::2].clip(min=0, max=max_shape[1])
bboxes[..., 1::2] = bboxes[..., 1::2].clip(min=0, max=max_shape[0])
return bboxes
def bbox2delta_v2(src_boxes,
tgt_boxes,
means=(0.0, 0.0, 0.0, 0.0),
stds=(1.0, 1.0, 1.0, 1.0)):
"""Encode bboxes to deltas.
Modified from paddlers.models.ppdet.modeling.bbox_utils.bbox2delta.
Args:
src_boxes (Tensor[..., 4]): base bboxes
tgt_boxes (Tensor[..., 4]): target bboxes
means (list[float]): the mean that will be used to normalize delta
stds (list[float]): the std that will be used to normalize delta
"""
if src_boxes.size == 0:
return paddle.empty_like(src_boxes)
src_w = src_boxes[..., 2] - src_boxes[..., 0]
src_h = src_boxes[..., 3] - src_boxes[..., 1]
src_ctr_x = src_boxes[..., 0] + 0.5 * src_w
src_ctr_y = src_boxes[..., 1] + 0.5 * src_h
tgt_w = tgt_boxes[..., 2] - tgt_boxes[..., 0]
tgt_h = tgt_boxes[..., 3] - tgt_boxes[..., 1]
tgt_ctr_x = tgt_boxes[..., 0] + 0.5 * tgt_w
tgt_ctr_y = tgt_boxes[..., 1] + 0.5 * tgt_h
dx = (tgt_ctr_x - src_ctr_x) / src_w
dy = (tgt_ctr_y - src_ctr_y) / src_h
dw = paddle.log(tgt_w / src_w)
dh = paddle.log(tgt_h / src_h)
deltas = paddle.stack((dx, dy, dw, dh), axis=1) # [n, 4]
means = paddle.to_tensor(means, place=src_boxes.place)
stds = paddle.to_tensor(stds, place=src_boxes.place)
deltas = (deltas - means) / stds
return deltas
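bbox2delta_v2 is the inverse of delta2bbox_v2; a round-trip sketch (standalone, assuming zero means and unit stds) recovers the target box exactly:

import paddle

src = paddle.to_tensor([[0., 0., 10., 10.]])
tgt = paddle.to_tensor([[2., 2., 8., 12.]])
src_wh = src[..., 2:] - src[..., :2]
src_ctr = src[..., :2] + 0.5 * src_wh
tgt_wh = tgt[..., 2:] - tgt[..., :2]
tgt_ctr = tgt[..., :2] + 0.5 * tgt_wh
dxy = (tgt_ctr - src_ctr) / src_wh           # encode: center offset
dwh = paddle.log(tgt_wh / src_wh)            # encode: log size ratio
rec_ctr = src_ctr + dxy * src_wh             # decode reverses both steps
rec_wh = src_wh * dwh.exp()
print(paddle.concat([rec_ctr - 0.5 * rec_wh, rec_ctr + 0.5 * rec_wh], -1))
# [[2., 2., 8., 12.]]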
def iou_similarity(box1, box2, eps=1e-10):
"""Calculate iou of box1 and box2
Args:
box1 (Tensor): box with the shape [M1, 4]
box2 (Tensor): box with the shape [M2, 4]
Return:
iou (Tensor): iou between box1 and box2 with the shape [M1, M2]
"""
box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4]
box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4]
px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4]
gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4]
x1y1 = paddle.maximum(px1y1, gx1y1)
x2y2 = paddle.minimum(px2y2, gx2y2)
overlap = (x2y2 - x1y1).clip(0).prod(-1)
area1 = (px2y2 - px1y1).clip(0).prod(-1)
area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
union = area1 + area2 - overlap + eps
return overlap / union
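A worked example of the broadcast IoU above (standalone sketch): box A = (0, 0, 2, 2) and box B = (1, 1, 3, 3) overlap in a 1x1 square, so IoU = 1 / (4 + 4 - 1) ≈ 0.1429.

import paddle

box1 = paddle.to_tensor([[0., 0., 2., 2.]]).unsqueeze(1)  # [M1, 1, 4]
box2 = paddle.to_tensor([[1., 1., 3., 3.]]).unsqueeze(0)  # [1, M2, 4]
x1y1 = paddle.maximum(box1[:, :, :2], box2[:, :, :2])
x2y2 = paddle.minimum(box1[:, :, 2:], box2[:, :, 2:])
overlap = (x2y2 - x1y1).clip(0).prod(-1)
area1 = (box1[:, :, 2:] - box1[:, :, :2]).clip(0).prod(-1)
area2 = (box2[:, :, 2:] - box2[:, :, :2]).clip(0).prod(-1)
print(overlap / (area1 + area2 - overlap + 1e-10))  # [[0.142857...]]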

@ -0,0 +1,40 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def _get_class_default_kwargs(cls, *args, **kwargs):
"""
Get the default arguments of a class in dict format. If args or
kwargs are specified, they override the corresponding defaults.
"""
varnames = cls.__init__.__code__.co_varnames
argcount = cls.__init__.__code__.co_argcount
keys = varnames[:argcount]
assert keys[0] == 'self'
keys = keys[1:]
values = list(cls.__init__.__defaults__)
assert len(values) == len(keys)
if len(args) > 0:
for i, arg in enumerate(args):
values[i] = arg
default_kwargs = dict(zip(keys, values))
if len(kwargs) > 0:
for k, v in kwargs.items():
default_kwargs[k] = v
return default_kwargs
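An illustration with a hypothetical class of how this helper behaves: it reads the __init__ signature instead of instantiating the class, which is why later hunks in this diff can replace RoIAlign().__dict__ with _get_class_default_kwargs(RoIAlign).

class Dummy:
    # hypothetical class used only for illustration
    def __init__(self, resolution=14, spatial_scale=0.0625):
        self.resolution = resolution
        self.spatial_scale = spatial_scale

print(_get_class_default_kwargs(Dummy))
# {'resolution': 14, 'spatial_scale': 0.0625}
print(_get_class_default_kwargs(Dummy, 7))  # positional override
# {'resolution': 7, 'spatial_scale': 0.0625}
print(_get_class_default_kwargs(Dummy, spatial_scale=0.125))
# {'resolution': 14, 'spatial_scale': 0.125}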

@ -31,6 +31,8 @@ from . import pico_head
from . import detr_head
from . import sparsercnn_head
from . import tood_head
from . import retina_head
from . import ppyoloe_head
from .bbox_head import *
from .mask_head import *
@ -51,3 +53,5 @@ from .pico_head import *
from .detr_head import *
from .sparsercnn_head import *
from .tood_head import *
from .retina_head import *
from .ppyoloe_head import *

@ -24,6 +24,7 @@ from paddlers.models.ppdet.core.workspace import register, create
from .roi_extractor import RoIAlign
from ..shape_spec import ShapeSpec
from ..bbox_utils import bbox2delta
from ..cls_utils import _get_class_default_kwargs
from paddlers.models.ppdet.modeling.layers import ConvNormLayer
__all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead']
@ -178,12 +179,13 @@ class BBoxHead(nn.Layer):
def __init__(self,
head,
in_channel,
roi_extractor=RoIAlign().__dict__,
roi_extractor=_get_class_default_kwargs(RoIAlign),
bbox_assigner='BboxAssigner',
with_pool=False,
num_classes=80,
bbox_weight=[10., 10., 5., 5.],
bbox_loss=None):
bbox_loss=None,
loss_normalize_pos=False):
super(BBoxHead, self).__init__()
self.head = head
self.roi_extractor = roi_extractor
@ -195,6 +197,7 @@ class BBoxHead(nn.Layer):
self.num_classes = num_classes
self.bbox_weight = bbox_weight
self.bbox_loss = bbox_loss
self.loss_normalize_pos = loss_normalize_pos
self.bbox_score = nn.Linear(
in_channel,
@ -249,14 +252,25 @@ class BBoxHead(nn.Layer):
deltas = self.bbox_delta(feat)
if self.training:
loss = self.get_loss(scores, deltas, targets, rois,
self.bbox_weight)
loss = self.get_loss(
scores,
deltas,
targets,
rois,
self.bbox_weight,
loss_normalize_pos=self.loss_normalize_pos)
return loss, bbox_feat
else:
pred = self.get_prediction(scores, deltas)
return pred, self.head
def get_loss(self, scores, deltas, targets, rois, bbox_weight):
def get_loss(self,
scores,
deltas,
targets,
rois,
bbox_weight,
loss_normalize_pos=False):
"""
scores (Tensor): scores from bbox head outputs
deltas (Tensor): deltas from bbox head outputs
@ -279,8 +293,15 @@ class BBoxHead(nn.Layer):
else:
tgt_labels = tgt_labels.cast('int64')
tgt_labels.stop_gradient = True
if not loss_normalize_pos:
loss_bbox_cls = F.cross_entropy(
input=scores, label=tgt_labels, reduction='mean')
else:
loss_bbox_cls = F.cross_entropy(
input=scores, label=tgt_labels,
reduction='none').sum() / (tgt_labels.shape[0] + 1e-7)
loss_bbox[cls_name] = loss_bbox_cls
# bbox reg
@ -321,9 +342,16 @@ class BBoxHead(nn.Layer):
if self.bbox_loss is not None:
reg_delta = self.bbox_transform(reg_delta)
reg_target = self.bbox_transform(reg_target)
if not loss_normalize_pos:
loss_bbox_reg = self.bbox_loss(
reg_delta, reg_target).sum() / tgt_labels.shape[0]
loss_bbox_reg *= self.num_classes
else:
loss_bbox_reg = self.bbox_loss(
reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7)
else:
loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum(
) / tgt_labels.shape[0]

@ -22,6 +22,7 @@ from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead
from .roi_extractor import RoIAlign
from ..shape_spec import ShapeSpec
from ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox
from ..cls_utils import _get_class_default_kwargs
__all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead']
@ -153,13 +154,18 @@ class CascadeHead(BBoxHead):
def __init__(self,
head,
in_channel,
roi_extractor=RoIAlign().__dict__,
roi_extractor=_get_class_default_kwargs(RoIAlign),
bbox_assigner='BboxAssigner',
num_classes=80,
bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0],
[30.0, 30.0, 15.0, 15.0]],
num_cascade_stages=3,
bbox_loss=None):
bbox_loss=None,
reg_class_agnostic=True,
stage_loss_weights=None,
loss_normalize_pos=False,
add_gt_as_proposals=[True, False, False]):
nn.Layer.__init__(self, )
self.head = head
self.roi_extractor = roi_extractor
@ -171,6 +177,18 @@ class CascadeHead(BBoxHead):
self.bbox_weight = bbox_weight
self.num_cascade_stages = num_cascade_stages
self.bbox_loss = bbox_loss
self.stage_loss_weights = [
1. / num_cascade_stages for _ in range(num_cascade_stages)
] if stage_loss_weights is None else stage_loss_weights
self.add_gt_as_proposals = add_gt_as_proposals
assert len(
self.stage_loss_weights
) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) does not equal num_cascade_stages({num_cascade_stages})'
self.reg_class_agnostic = reg_class_agnostic
num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes
self.loss_normalize_pos = loss_normalize_pos
self.bbox_score_list = []
self.bbox_delta_list = []
@ -189,7 +207,7 @@ class CascadeHead(BBoxHead):
delta_name,
nn.Linear(
in_channel,
4,
num_bbox_delta,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0.0, std=0.001))))
self.bbox_score_list.append(bbox_score)
@ -206,7 +224,11 @@ class CascadeHead(BBoxHead):
"""
targets = []
if self.training:
rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs)
rois, rois_num, targets = self.bbox_assigner(
rois,
rois_num,
inputs,
add_gt_as_proposals=self.add_gt_as_proposals[0])
targets_list = [targets]
self.assigned_rois = (rois, rois_num)
self.assigned_targets = targets
@ -219,13 +241,32 @@ class CascadeHead(BBoxHead):
inputs['im_shape'])
if self.training:
rois, rois_num, targets = self.bbox_assigner(
rois, rois_num, inputs, i, is_cascade=True)
rois,
rois_num,
inputs,
i,
is_cascade=True,
add_gt_as_proposals=self.add_gt_as_proposals[i])
targets_list.append(targets)
rois_feat = self.roi_extractor(body_feats, rois, rois_num)
bbox_feat = self.head(rois_feat, i)
scores = self.bbox_score_list[i](bbox_feat)
deltas = self.bbox_delta_list[i](bbox_feat)
# TODO (lyuwenyu) Is it correct for only one class ?
if not self.reg_class_agnostic and i < self.num_cascade_stages - 1:
deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4])
labels = scores[:, :-1].argmax(axis=-1)
if self.training:
deltas = deltas[paddle.arange(deltas.shape[0]), labels]
else:
deltas = deltas[((deltas + 10000) * F.one_hot(
labels, num_classes=self.num_classes).unsqueeze(-1) != 0
).nonzero(as_tuple=True)].reshape(
[deltas.shape[0], 4])
head_out_list.append([scores, deltas, rois])
pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i])
@ -233,11 +274,16 @@ class CascadeHead(BBoxHead):
loss = {}
for stage, value in enumerate(zip(head_out_list, targets_list)):
(scores, deltas, rois), targets = value
loss_stage = self.get_loss(scores, deltas, targets, rois,
self.bbox_weight[stage])
loss_stage = self.get_loss(
scores,
deltas,
targets,
rois,
self.bbox_weight[stage],
loss_normalize_pos=self.loss_normalize_pos)
for k, v in loss_stage.items():
loss[k + "_stage{}".format(
stage)] = v / self.num_cascade_stages
stage)] = v * self.stage_loss_weights[stage]
return loss, bbox_feat
else:
@ -266,6 +312,14 @@ class CascadeHead(BBoxHead):
num_prop = []
for p in proposals:
num_prop.append(p.shape[0])
# NOTE(dev): num_prop will be tagged as LoDTensorArray because it
# depends on batch_size under @to_static. However, the argument
# num_or_sections in paddle.split does not support LoDTensorArray,
# so we use [-1] to replace it if num_prop is not a list. This
# ensures the correctness of both dynamic and static graphs.
if not isinstance(num_prop, list):
num_prop = [-1]
return pred_bbox.split(num_prop)
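A standalone sketch of the [-1] fallback described in the note above: paddle.split treats a single [-1] section as one piece of inferred size, i.e. the whole tensor, which per that note works in both dynamic and static graphs.

import paddle

pred_bbox = paddle.rand([6, 4])
parts = paddle.split(pred_bbox, [2, 4])  # explicit per-image sections
whole = paddle.split(pred_bbox, [-1])    # the [-1] fallback: one piece
print(len(parts), parts[0].shape)        # 2 [2, 4]
print(len(whole), whole[0].shape)        # 1 [6, 4]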
def get_prediction(self, head_out_list):

@ -17,6 +17,7 @@ import paddle.nn as nn
from paddlers.models.ppdet.core.workspace import register
from ..layers import AnchorGeneratorSSD
from ..cls_utils import _get_class_default_kwargs
@register
@ -39,7 +40,7 @@ class FaceHead(nn.Layer):
def __init__(self,
num_classes=80,
in_channels=[96, 96],
anchor_generator=AnchorGeneratorSSD().__dict__,
anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),
kernel_size=3,
padding=1,
conv_decay=0.,

@ -64,6 +64,8 @@ class FCOSFeat(nn.Layer):
norm_type='bn',
use_dcn=False):
super(FCOSFeat, self).__init__()
self.feat_in = feat_in
self.feat_out = feat_out
self.num_convs = num_convs
self.norm_type = norm_type
self.cls_subnet_convs = []

@ -29,7 +29,7 @@ from paddle.nn.initializer import Normal, Constant
from paddlers.models.ppdet.core.workspace import register
from paddlers.models.ppdet.modeling.layers import ConvNormLayer
from paddlers.models.ppdet.modeling.bbox_utils import distance2bbox, bbox2distance
from paddlers.models.ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox
from paddlers.models.ppdet.data.transform.atss_assigner import bbox_overlaps
@ -79,7 +79,9 @@ class Integral(nn.Layer):
offsets from the box center in four directions, shape (N, 4).
"""
x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1)
x = F.linear(x, self.project).reshape([-1, 4])
x = F.linear(x, self.project)
if self.training:
x = x.reshape([-1, 4])
return x
@ -241,18 +243,34 @@ class GFLHead(nn.Layer):
), "The size of fpn_feats is not equal to size of fpn_stride"
cls_logits_list = []
bboxes_reg_list = []
for scale_reg, fpn_feat in zip(self.scales_regs, fpn_feats):
for stride, scale_reg, fpn_feat in zip(self.fpn_stride,
self.scales_regs, fpn_feats):
conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat)
cls_logits = self.gfl_head_cls(conv_cls_feat)
bbox_reg = scale_reg(self.gfl_head_reg(conv_reg_feat))
cls_score = self.gfl_head_cls(conv_cls_feat)
bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat))
if self.dgqp_module:
quality_score = self.dgqp_module(bbox_reg)
cls_logits = F.sigmoid(cls_logits) * quality_score
quality_score = self.dgqp_module(bbox_pred)
cls_score = F.sigmoid(cls_score) * quality_score
if not self.training:
cls_logits = F.sigmoid(cls_logits.transpose([0, 2, 3, 1]))
bbox_reg = bbox_reg.transpose([0, 2, 3, 1])
cls_logits_list.append(cls_logits)
bboxes_reg_list.append(bbox_reg)
cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1]))
bbox_pred = bbox_pred.transpose([0, 2, 3, 1])
b, cell_h, cell_w, _ = paddle.shape(cls_score)
y, x = self.get_single_level_center_point(
[cell_h, cell_w], stride, cell_offset=self.cell_offset)
center_points = paddle.stack([x, y], axis=-1)
cls_score = cls_score.reshape([b, -1, self.cls_out_channels])
bbox_pred = self.distribution_project(bbox_pred) * stride
bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4])
# NOTE: If keep_ratio=False and the image shape is a
# multiple of 32, distance2bbox does not set the max_shapes
# parameter, which speeds up model prediction. If max_shapes
# is needed, use inputs['im_shape'].
bbox_pred = batch_distance2bbox(
center_points, bbox_pred, max_shapes=None)
cls_logits_list.append(cls_score)
bboxes_reg_list.append(bbox_pred)
return (cls_logits_list, bboxes_reg_list)
@ -370,7 +388,7 @@ class GFLHead(nn.Layer):
avg_factor = sum(avg_factor)
try:
avg_factor = paddle.distributed.all_reduce(avg_factor.clone())
paddle.distributed.all_reduce(avg_factor)
avg_factor = paddle.clip(
avg_factor / paddle.distributed.get_world_size(), min=1)
except:
@ -410,71 +428,13 @@ class GFLHead(nn.Layer):
x = x.flatten()
return y, x
def get_bboxes_single(self,
cls_scores,
bbox_preds,
img_shape,
scale_factor,
rescale=True,
cell_offset=0):
assert len(cls_scores) == len(bbox_preds)
mlvl_bboxes = []
mlvl_scores = []
for stride, cls_score, bbox_pred in zip(self.fpn_stride, cls_scores,
bbox_preds):
featmap_size = [
paddle.shape(cls_score)[0], paddle.shape(cls_score)[1]
]
y, x = self.get_single_level_center_point(
featmap_size, stride, cell_offset=cell_offset)
center_points = paddle.stack([x, y], axis=-1)
scores = cls_score.reshape([-1, self.cls_out_channels])
bbox_pred = self.distribution_project(bbox_pred) * stride
if scores.shape[0] > self.nms_pre:
max_scores = scores.max(axis=1)
_, topk_inds = max_scores.topk(self.nms_pre)
center_points = center_points.gather(topk_inds)
bbox_pred = bbox_pred.gather(topk_inds)
scores = scores.gather(topk_inds)
bboxes = distance2bbox(
center_points, bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_bboxes = paddle.concat(mlvl_bboxes)
if rescale:
# [h_scale, w_scale] to [w_scale, h_scale, w_scale, h_scale]
im_scale = paddle.concat([scale_factor[::-1], scale_factor[::-1]])
mlvl_bboxes /= im_scale
mlvl_scores = paddle.concat(mlvl_scores)
mlvl_scores = mlvl_scores.transpose([1, 0])
return mlvl_bboxes, mlvl_scores
def decode(self, cls_scores, bbox_preds, im_shape, scale_factor,
cell_offset):
batch_bboxes = []
batch_scores = []
for img_id in range(cls_scores[0].shape[0]):
num_levels = len(cls_scores)
cls_score_list = [cls_scores[i][img_id] for i in range(num_levels)]
bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_levels)]
bboxes, scores = self.get_bboxes_single(
cls_score_list,
bbox_pred_list,
im_shape[img_id],
scale_factor[img_id],
cell_offset=cell_offset)
batch_bboxes.append(bboxes)
batch_scores.append(scores)
batch_bboxes = paddle.stack(batch_bboxes, axis=0)
batch_scores = paddle.stack(batch_scores, axis=0)
return batch_bboxes, batch_scores
def post_process(self, gfl_head_outs, im_shape, scale_factor):
cls_scores, bboxes_reg = gfl_head_outs
bboxes, score = self.decode(cls_scores, bboxes_reg, im_shape,
scale_factor, self.cell_offset)
bbox_pred, bbox_num, _ = self.nms(bboxes, score)
bboxes = paddle.concat(bboxes_reg, axis=1)
# rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]
im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1)
bboxes /= im_scale
mlvl_scores = paddle.concat(cls_scores, axis=1)
mlvl_scores = mlvl_scores.transpose([0, 2, 1])
bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores)
return bbox_pred, bbox_num
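The flip/tile one-liner in the new post_process builds a per-image (w, h, w, h) divisor from an (h_scale, w_scale) factor; a standalone sketch:

import paddle

scale_factor = paddle.to_tensor([[2.0, 0.5]])  # (h_scale, w_scale)
# flip([1]) -> (w, h); tile([1, 2]) -> (w, h, w, h); unsqueeze(1)
# broadcasts the divisor over the per-image box dimension.
print(scale_factor.flip([1]).tile([1, 2]).unsqueeze(1))
# [[[0.5, 2.0, 0.5, 2.0]]]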

@ -20,6 +20,7 @@ from paddle.nn.initializer import KaimingNormal
from paddlers.models.ppdet.core.workspace import register, create
from paddlers.models.ppdet.modeling.layers import ConvNormLayer
from .roi_extractor import RoIAlign
from ..cls_utils import _get_class_default_kwargs
@register
@ -103,7 +104,7 @@ class MaskFeat(nn.Layer):
@register
class MaskHead(nn.Layer):
__shared__ = ['num_classes']
__shared__ = ['num_classes', 'export_onnx']
__inject__ = ['mask_assigner']
"""
RCNN mask head
@ -120,12 +121,14 @@ class MaskHead(nn.Layer):
def __init__(self,
head,
roi_extractor=RoIAlign().__dict__,
roi_extractor=_get_class_default_kwargs(RoIAlign),
mask_assigner='MaskAssigner',
num_classes=80,
share_bbox_feat=False):
share_bbox_feat=False,
export_onnx=False):
super(MaskHead, self).__init__()
self.num_classes = num_classes
self.export_onnx = export_onnx
self.roi_extractor = roi_extractor
if isinstance(roi_extractor, dict):
@ -206,8 +209,8 @@ class MaskHead(nn.Layer):
rois_num (Tensor): The number of prediction for each batch
scale_factor (Tensor): The scale factor from origin size to input size
"""
if rois.shape[0] == 0:
mask_out = paddle.full([1, 1, 1, 1], -1)
if not self.export_onnx and rois.shape[0] == 0:
mask_out = paddle.full([1, 1, 1], -1)
else:
bbox = [rois[:, 2:]]
labels = rois[:, 0].cast('int32')
@ -218,19 +221,17 @@ class MaskHead(nn.Layer):
mask_feat = self.head(rois_feat)
mask_logit = self.mask_fcn_logits(mask_feat)
mask_num_class = mask_logit.shape[1]
if mask_num_class == 1:
mask_out = F.sigmoid(mask_logit)
if self.num_classes == 1:
mask_out = F.sigmoid(mask_logit)[:, 0, :, :]
else:
num_masks = mask_logit.shape[0]
mask_out = []
# TODO: need to optimize gather
for i in range(mask_logit.shape[0]):
pred_masks = paddle.unsqueeze(
mask_logit[i, :, :, :], axis=0)
mask = paddle.gather(pred_masks, labels[i], axis=1)
mask_out.append(mask)
mask_out = F.sigmoid(paddle.concat(mask_out))
num_masks = paddle.shape(mask_logit)[0]
index = paddle.arange(num_masks).cast('int32')
mask_out = mask_logit[index, labels]
mask_out_shape = paddle.shape(mask_out)
mask_out = paddle.reshape(mask_out, [
paddle.shape(index), mask_out_shape[-2], mask_out_shape[-1]
])
mask_out = F.sigmoid(mask_out)
return mask_out
def forward(self,

@ -24,9 +24,36 @@ import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant
from paddlers.models.ppdet.modeling.ops import get_static_shape
from ..initializer import normal_
from ..assigners.utils import generate_anchors_for_grid_cell
from ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance
from paddlers.models.ppdet.core.workspace import register
from paddlers.models.ppdet.modeling.layers import ConvNormLayer
from .simota_head import OTAVFLHead
from .gfl_head import Integral, GFLHead
from paddlers.models.ppdet.modeling.necks.csp_pan import DPModule
eps = 1e-9
__all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat']
class PicoSE(nn.Layer):
def __init__(self, feat_channels):
super(PicoSE, self).__init__()
self.fc = nn.Conv2D(feat_channels, feat_channels, 1)
self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1)
self._init_weights()
def _init_weights(self):
normal_(self.fc.weight, std=0.001)
def forward(self, feat, avg_feat):
weight = F.sigmoid(self.fc(avg_feat))
out = self.conv(feat * weight)
return out
@register
@ -39,6 +66,9 @@ class PicoFeat(nn.Layer):
feat_out (int): The channel number of output Tensor.
num_convs (int): The convolution number of the LiteGFLFeat.
norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'.
share_cls_reg (bool): Whether to share the cls and reg output.
act (str): The activation function of each layer.
use_se (bool): Whether to use the SE module.
"""
def __init__(self,
@ -48,14 +78,20 @@ class PicoFeat(nn.Layer):
num_convs=2,
norm_type='bn',
share_cls_reg=False,
act='hard_swish'):
act='hard_swish',
use_se=False):
super(PicoFeat, self).__init__()
self.num_convs = num_convs
self.norm_type = norm_type
self.share_cls_reg = share_cls_reg
self.act = act
self.use_se = use_se
self.cls_convs = []
self.reg_convs = []
if use_se:
assert share_cls_reg == True, \
'In the case of using se, share_cls_reg must be set to True'
self.se = nn.LayerList()
for stage_idx in range(num_fpn_stride):
cls_subnet_convs = []
reg_subnet_convs = []
@ -111,12 +147,16 @@ class PicoFeat(nn.Layer):
reg_subnet_convs.append(reg_conv_pw)
self.cls_convs.append(cls_subnet_convs)
self.reg_convs.append(reg_subnet_convs)
if use_se:
self.se.append(PicoSE(feat_out))
def act_func(self, x):
if self.act == "leaky_relu":
x = F.leaky_relu(x)
elif self.act == "hard_swish":
x = F.hardswish(x)
elif self.act == "relu6":
x = F.relu6(x)
return x
def forward(self, fpn_feat, stage_idx):
@ -125,8 +165,13 @@ class PicoFeat(nn.Layer):
reg_feat = fpn_feat
for i in range(len(self.cls_convs[stage_idx])):
cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat))
reg_feat = cls_feat
if not self.share_cls_reg:
reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat))
if self.use_se:
avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1))
se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat))
return cls_feat, se_feat
return cls_feat, reg_feat
@ -150,7 +195,7 @@ class PicoHead(OTAVFLHead):
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
'assigner', 'nms'
]
__shared__ = ['num_classes']
__shared__ = ['num_classes', 'eval_size']
def __init__(self,
conv_feat='PicoFeat',
@ -166,7 +211,8 @@ class PicoHead(OTAVFLHead):
feat_in_chan=96,
nms=None,
nms_pre=1000,
cell_offset=0):
cell_offset=0,
eval_size=None):
super(PicoHead, self).__init__(
conv_feat=conv_feat,
dgqp_module=dgqp_module,
@ -195,6 +241,7 @@ class PicoHead(OTAVFLHead):
self.nms = nms
self.nms_pre = nms_pre
self.cell_offset = cell_offset
self.eval_size = eval_size
self.use_sigmoid = self.loss_vfl.use_sigmoid
if self.use_sigmoid:
@ -238,12 +285,23 @@ class PicoHead(OTAVFLHead):
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.head_reg_list.append(head_reg)
def forward(self, fpn_feats, deploy=False):
# initialize the anchor points
if self.eval_size:
self.anchor_points, self.stride_tensor = self._generate_anchors()
def forward(self, fpn_feats, export_post_process=True):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
cls_logits_list = []
bboxes_reg_list = []
if self.training:
return self.forward_train(fpn_feats)
else:
return self.forward_eval(
fpn_feats, export_post_process=export_post_process)
def forward_train(self, fpn_feats):
cls_logits_list, bboxes_reg_list = [], []
for i, fpn_feat in enumerate(fpn_feats):
conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i)
if self.conv_feat.share_cls_reg:
@ -260,18 +318,466 @@ class PicoHead(OTAVFLHead):
quality_score = self.dgqp_module(bbox_pred)
cls_score = F.sigmoid(cls_score) * quality_score
if deploy:
cls_logits_list.append(cls_score)
bboxes_reg_list.append(bbox_pred)
return (cls_logits_list, bboxes_reg_list)
def forward_eval(self, fpn_feats, export_post_process=True):
if self.eval_size:
anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
else:
anchor_points, stride_tensor = self._generate_anchors(fpn_feats)
cls_logits_list, bboxes_reg_list = [], []
for i, fpn_feat in enumerate(fpn_feats):
conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i)
if self.conv_feat.share_cls_reg:
cls_logits = self.head_cls_list[i](conv_cls_feat)
cls_score, bbox_pred = paddle.split(
cls_logits,
[self.cls_out_channels, 4 * (self.reg_max + 1)],
axis=1)
else:
cls_score = self.head_cls_list[i](conv_cls_feat)
bbox_pred = self.head_reg_list[i](conv_reg_feat)
if self.dgqp_module:
quality_score = self.dgqp_module(bbox_pred)
cls_score = F.sigmoid(cls_score) * quality_score
if not export_post_process:
# Now only supports batch size = 1 in deploy
# TODO(ygh): support batch size > 1
cls_score = F.sigmoid(cls_score).reshape(
cls_score_out = F.sigmoid(cls_score).reshape(
[1, self.cls_out_channels, -1]).transpose([0, 2, 1])
bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4,
-1]).transpose([0, 2, 1])
elif not self.training:
cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1]))
else:
_, _, h, w = fpn_feat.shape
l = h * w
cls_score_out = F.sigmoid(
cls_score.reshape([-1, self.cls_out_channels, l]))
bbox_pred = bbox_pred.transpose([0, 2, 3, 1])
bbox_pred = self.distribution_project(bbox_pred)
bbox_pred = bbox_pred.reshape([-1, l, 4])
cls_logits_list.append(cls_score)
cls_logits_list.append(cls_score_out)
bboxes_reg_list.append(bbox_pred)
if export_post_process:
cls_logits_list = paddle.concat(cls_logits_list, axis=-1)
bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1)
bboxes_reg_list = batch_distance2bbox(anchor_points,
bboxes_reg_list)
bboxes_reg_list *= stride_tensor
return (cls_logits_list, bboxes_reg_list)
def _generate_anchors(self, feats=None):
# only used at eval time
anchor_points = []
stride_tensor = []
for i, stride in enumerate(self.fpn_stride):
if feats is not None:
_, _, h, w = feats[i].shape
else:
h = math.ceil(self.eval_size[0] / stride)
w = math.ceil(self.eval_size[1] / stride)
shift_x = paddle.arange(end=w) + self.cell_offset
shift_y = paddle.arange(end=h) + self.cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype='float32')
anchor_points.append(anchor_point.reshape([-1, 2]))
stride_tensor.append(
paddle.full(
[h * w, 1], stride, dtype='float32'))
anchor_points = paddle.concat(anchor_points)
stride_tensor = paddle.concat(stride_tensor)
return anchor_points, stride_tensor
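A tiny worked example of the anchor grid built above (standalone sketch): a 2x2 feature map at stride 8 with cell_offset=0 yields four centers in grid units, each paired with its stride.

import paddle

h, w, stride, cell_offset = 2, 2, 8, 0
shift_x = paddle.arange(end=w) + cell_offset
shift_y = paddle.arange(end=h) + cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
pts = paddle.cast(paddle.stack([shift_x, shift_y], -1), 'float32')
print(pts.reshape([-1, 2]))  # [[0,0], [1,0], [0,1], [1,1]]
print(paddle.full([h * w, 1], stride, dtype='float32').flatten())
# [8., 8., 8., 8.]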
def post_process(self, head_outs, scale_factor, export_nms=True):
pred_scores, pred_bboxes = head_outs
if not export_nms:
return pred_bboxes, pred_scores
else:
# rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[scale_x, scale_y, scale_x, scale_y],
axis=-1).reshape([-1, 1, 4])
# scale bbox to origin image size.
pred_bboxes /= scale_factor
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num
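The split/concat rescale above is the same (w, h, w, h) trick in another idiom; a standalone sketch:

import paddle

scale_factor = paddle.to_tensor([[2.0, 0.5]])  # (h_scale, w_scale)
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
divisor = paddle.concat([scale_x, scale_y, scale_x, scale_y],
                        axis=-1).reshape([-1, 1, 4])
print(divisor)  # [[[0.5, 2.0, 0.5, 2.0]]]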
@register
class PicoHeadV2(GFLHead):
"""
PicoHeadV2
Args:
conv_feat (object): Instance of 'PicoFeat'
num_classes (int): Number of classes
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
loss_class (object): Instance of VariFocalLoss.
loss_dfl (object): Instance of DistributionFocalLoss.
loss_bbox (object): Instance of bbox loss.
assigner (object): Instance of label assigner.
reg_max: Max value of the integral set :math:`{0, ..., reg_max}`
in QFL setting. Default: 7.
"""
__inject__ = [
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
'static_assigner', 'assigner', 'nms'
]
__shared__ = ['num_classes', 'eval_size']
def __init__(self,
conv_feat='PicoFeatV2',
dgqp_module=None,
num_classes=80,
fpn_stride=[8, 16, 32],
prior_prob=0.01,
use_align_head=True,
loss_class='VariFocalLoss',
loss_dfl='DistributionFocalLoss',
loss_bbox='GIoULoss',
static_assigner_epoch=60,
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner',
reg_max=16,
feat_in_chan=96,
nms=None,
nms_pre=1000,
cell_offset=0,
act='hard_swish',
grid_cell_scale=5.0,
eval_size=None):
super(PicoHeadV2, self).__init__(
conv_feat=conv_feat,
dgqp_module=dgqp_module,
num_classes=num_classes,
fpn_stride=fpn_stride,
prior_prob=prior_prob,
loss_class=loss_class,
loss_dfl=loss_dfl,
loss_bbox=loss_bbox,
reg_max=reg_max,
feat_in_chan=feat_in_chan,
nms=nms,
nms_pre=nms_pre,
cell_offset=cell_offset, )
self.conv_feat = conv_feat
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.loss_vfl = loss_class
self.loss_dfl = loss_dfl
self.loss_bbox = loss_bbox
self.static_assigner_epoch = static_assigner_epoch
self.static_assigner = static_assigner
self.assigner = assigner
self.reg_max = reg_max
self.feat_in_chan = feat_in_chan
self.nms = nms
self.nms_pre = nms_pre
self.cell_offset = cell_offset
self.act = act
self.grid_cell_scale = grid_cell_scale
self.use_align_head = use_align_head
self.cls_out_channels = self.num_classes
self.eval_size = eval_size
bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
# Clear the super class initialization
self.gfl_head_cls = None
self.gfl_head_reg = None
self.scales_regs = None
self.head_cls_list = []
self.head_reg_list = []
self.cls_align = nn.LayerList()
for i in range(len(fpn_stride)):
head_cls = self.add_sublayer(
"head_cls" + str(i),
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=self.cls_out_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
initializer=Constant(value=bias_init_value))))
self.head_cls_list.append(head_cls)
head_reg = self.add_sublayer(
"head_reg" + str(i),
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=4 * (self.reg_max + 1),
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.head_reg_list.append(head_reg)
if self.use_align_head:
self.cls_align.append(
DPModule(
self.feat_in_chan,
1,
5,
act=self.act,
use_act_in_out=False))
# initialize the anchor points
if self.eval_size:
self.anchor_points, self.stride_tensor = self._generate_anchors()
def forward(self, fpn_feats, export_post_process=True):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
if self.training:
return self.forward_train(fpn_feats)
else:
return self.forward_eval(
fpn_feats, export_post_process=export_post_process)
def forward_train(self, fpn_feats):
cls_score_list, reg_list, box_list = [], [], []
for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)):
b, _, h, w = get_static_shape(fpn_feat)
# task decomposition
conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i)
cls_logit = self.head_cls_list[i](se_feat)
reg_pred = self.head_reg_list[i](se_feat)
# cls prediction and alignment
if self.use_align_head:
cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat))
cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt()
else:
cls_score = F.sigmoid(cls_logit)
cls_score_out = cls_score.transpose([0, 2, 3, 1])
bbox_pred = reg_pred.transpose([0, 2, 3, 1])
b, cell_h, cell_w, _ = paddle.shape(cls_score_out)
y, x = self.get_single_level_center_point(
[cell_h, cell_w], stride, cell_offset=self.cell_offset)
center_points = paddle.stack([x, y], axis=-1)
cls_score_out = cls_score_out.reshape(
[b, -1, self.cls_out_channels])
bbox_pred = self.distribution_project(bbox_pred) * stride
bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4])
bbox_pred = batch_distance2bbox(
center_points, bbox_pred, max_shapes=None)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1]))
box_list.append(bbox_pred / stride)
cls_score_list = paddle.concat(cls_score_list, axis=1)
box_list = paddle.concat(box_list, axis=1)
reg_list = paddle.concat(reg_list, axis=1)
return cls_score_list, reg_list, box_list, fpn_feats
def forward_eval(self, fpn_feats, export_post_process=True):
if self.eval_size:
anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
else:
anchor_points, stride_tensor = self._generate_anchors(fpn_feats)
cls_score_list, box_list = [], []
for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)):
_, _, h, w = fpn_feat.shape
# task decomposition
conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i)
cls_logit = self.head_cls_list[i](se_feat)
reg_pred = self.head_reg_list[i](se_feat)
# cls prediction and alignment
if self.use_align_head:
cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat))
cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt()
else:
cls_score = F.sigmoid(cls_logit)
if not export_post_process:
# Now only supports batch size = 1 in deploy
cls_score_list.append(
cls_score.reshape([1, self.cls_out_channels, -1]).transpose(
[0, 2, 1]))
box_list.append(
reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose(
[0, 2, 1]))
else:
l = h * w
cls_score_out = cls_score.reshape(
[-1, self.cls_out_channels, l])
bbox_pred = reg_pred.transpose([0, 2, 3, 1])
bbox_pred = self.distribution_project(bbox_pred)
bbox_pred = bbox_pred.reshape([-1, l, 4])
cls_score_list.append(cls_score_out)
box_list.append(bbox_pred)
if export_post_process:
cls_score_list = paddle.concat(cls_score_list, axis=-1)
box_list = paddle.concat(box_list, axis=1)
box_list = batch_distance2bbox(anchor_points, box_list)
box_list *= stride_tensor
return cls_score_list, box_list
def get_loss(self, head_outs, gt_meta):
pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs
gt_labels = gt_meta['gt_class']
gt_bboxes = gt_meta['gt_bbox']
gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None
num_imgs = gt_meta['im_id'].shape[0]
pad_gt_mask = gt_meta['pad_gt_mask']
anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell(
fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset)
centers = bbox_center(anchors)
# label assignment
if gt_meta['epoch_id'] < self.static_assigner_epoch:
assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(
anchors,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
gt_scores=gt_scores,
pred_bboxes=pred_bboxes.detach() * stride_tensor_list)
else:
assigned_labels, assigned_bboxes, assigned_scores = self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor_list,
centers,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
gt_scores=gt_scores)
assigned_bboxes /= stride_tensor_list
centers_shape = centers.shape
flatten_centers = centers.expand(
[num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2])
flatten_strides = stride_tensor_list.expand(
[num_imgs, centers_shape[0], 1]).reshape([-1, 1])
flatten_cls_preds = pred_scores.reshape([-1, self.num_classes])
flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)])
flatten_bboxes = pred_bboxes.reshape([-1, 4])
flatten_bbox_targets = assigned_bboxes.reshape([-1, 4])
flatten_labels = assigned_labels.reshape([-1])
flatten_assigned_scores = assigned_scores.reshape(
[-1, self.num_classes])
pos_inds = paddle.nonzero(
paddle.logical_and((flatten_labels >= 0),
(flatten_labels < self.num_classes)),
as_tuple=False).squeeze(1)
num_total_pos = len(pos_inds)
if num_total_pos > 0:
pos_bbox_targets = paddle.gather(
flatten_bbox_targets, pos_inds, axis=0)
pos_decode_bbox_pred = paddle.gather(
flatten_bboxes, pos_inds, axis=0)
pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0)
pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0)
pos_centers = paddle.gather(
flatten_centers, pos_inds, axis=0) / pos_strides
weight_targets = flatten_assigned_scores.detach()
weight_targets = paddle.gather(
weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)
pred_corners = pos_reg.reshape([-1, self.reg_max + 1])
target_corners = bbox2distance(pos_centers, pos_bbox_targets,
self.reg_max).reshape([-1])
# regression loss
loss_bbox = paddle.sum(
self.loss_bbox(pos_decode_bbox_pred,
pos_bbox_targets) * weight_targets)
# dfl loss
loss_dfl = self.loss_dfl(
pred_corners,
target_corners,
weight=weight_targets.expand([-1, 4]).reshape([-1]),
avg_factor=4.0)
else:
loss_bbox = paddle.zeros([1])
loss_dfl = paddle.zeros([1])
avg_factor = flatten_assigned_scores.sum()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(avg_factor)
avg_factor = paddle.clip(
avg_factor / paddle.distributed.get_world_size(), min=1)
loss_vfl = self.loss_vfl(
flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor)
loss_bbox = loss_bbox / avg_factor
loss_dfl = loss_dfl / avg_factor
loss_states = dict(
loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)
return loss_states
def _generate_anchors(self, feats=None):
# only used at eval time
anchor_points = []
stride_tensor = []
for i, stride in enumerate(self.fpn_stride):
if feats is not None:
_, _, h, w = feats[i].shape
else:
h = math.ceil(self.eval_size[0] / stride)
w = math.ceil(self.eval_size[1] / stride)
shift_x = paddle.arange(end=w) + self.cell_offset
shift_y = paddle.arange(end=h) + self.cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype='float32')
anchor_points.append(anchor_point.reshape([-1, 2]))
stride_tensor.append(
paddle.full(
[h * w, 1], stride, dtype='float32'))
anchor_points = paddle.concat(anchor_points)
stride_tensor = paddle.concat(stride_tensor)
return anchor_points, stride_tensor
def post_process(self, head_outs, scale_factor, export_nms=True):
pred_scores, pred_bboxes = head_outs
if not export_nms:
return pred_bboxes, pred_scores
else:
# rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[scale_x, scale_y, scale_x, scale_y],
axis=-1).reshape([-1, 1, 4])
# scale bbox to origin image size.
pred_bboxes /= scale_factor
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num

@ -0,0 +1,388 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppdet.core.workspace import register
from ..bbox_utils import batch_distance2bbox
from ..losses import GIoULoss
from ..initializer import bias_init_with_prob, constant_, normal_
from ..assigners.utils import generate_anchors_for_grid_cell
from paddlers.models.ppdet.modeling.backbones.cspresnet import ConvBNLayer
from paddlers.models.ppdet.modeling.ops import get_static_shape, get_act_fn
from paddlers.models.ppdet.modeling.layers import MultiClassNMS
__all__ = ['PPYOLOEHead']
class ESEAttn(nn.Layer):
def __init__(self, feat_channels, act='swish'):
super(ESEAttn, self).__init__()
self.fc = nn.Conv2D(feat_channels, feat_channels, 1)
self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act)
self._init_weights()
def _init_weights(self):
normal_(self.fc.weight, std=0.001)
def forward(self, feat, avg_feat):
weight = F.sigmoid(self.fc(avg_feat))
return self.conv(feat * weight)
@register
class PPYOLOEHead(nn.Layer):
__shared__ = [
'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process'
]
__inject__ = ['static_assigner', 'assigner', 'nms']
def __init__(self,
in_channels=[1024, 512, 256],
num_classes=80,
act='swish',
fpn_strides=(32, 16, 8),
grid_cell_scale=5.0,
grid_cell_offset=0.5,
reg_max=16,
static_assigner_epoch=4,
use_varifocal_loss=True,
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner',
nms='MultiClassNMS',
eval_size=None,
loss_weight={
'class': 1.0,
'iou': 2.5,
'dfl': 0.5,
},
trt=False,
exclude_nms=False,
exclude_post_process=False):
super(PPYOLOEHead, self).__init__()
assert len(in_channels) > 0, "len(in_channels) should > 0"
self.in_channels = in_channels
self.num_classes = num_classes
self.fpn_strides = fpn_strides
self.grid_cell_scale = grid_cell_scale
self.grid_cell_offset = grid_cell_offset
self.reg_max = reg_max
self.iou_loss = GIoULoss()
self.loss_weight = loss_weight
self.use_varifocal_loss = use_varifocal_loss
self.eval_size = eval_size
self.static_assigner_epoch = static_assigner_epoch
self.static_assigner = static_assigner
self.assigner = assigner
self.nms = nms
if isinstance(self.nms, MultiClassNMS) and trt:
self.nms.trt = trt
self.exclude_nms = exclude_nms
self.exclude_post_process = exclude_post_process
# stem
self.stem_cls = nn.LayerList()
self.stem_reg = nn.LayerList()
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
for in_c in self.in_channels:
self.stem_cls.append(ESEAttn(in_c, act=act))
self.stem_reg.append(ESEAttn(in_c, act=act))
# pred head
self.pred_cls = nn.LayerList()
self.pred_reg = nn.LayerList()
for in_c in self.in_channels:
self.pred_cls.append(
nn.Conv2D(
in_c, self.num_classes, 3, padding=1))
self.pred_reg.append(
nn.Conv2D(
in_c, 4 * (self.reg_max + 1), 3, padding=1))
# projection conv
self.proj_conv = nn.Conv2D(self.reg_max + 1, 1, 1, bias_attr=False)
self.proj_conv.skip_quant = True
self._init_weights()
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
def _init_weights(self):
bias_cls = bias_init_with_prob(0.01)
for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
constant_(cls_.weight)
constant_(cls_.bias, bias_cls)
constant_(reg_.weight)
constant_(reg_.bias, 1.0)
proj = paddle.linspace(0, self.reg_max, self.reg_max + 1).reshape(
[1, self.reg_max + 1, 1, 1])
self.proj_conv.weight.set_value(proj)
self.proj_conv.weight.stop_gradient = True
if self.eval_size:
anchor_points, stride_tensor = self._generate_anchors()
self.anchor_points = anchor_points
self.stride_tensor = stride_tensor
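The frozen proj_conv above acts as an expectation operator: after softmax over the reg_max + 1 bins, the 1x1 conv with linspace weights computes the distribution mean. A standalone check: a uniform distribution over {0, ..., 16} has mean 8.

import paddle
import paddle.nn.functional as F

reg_max = 16
proj = paddle.linspace(0, reg_max, reg_max + 1)         # bin values 0..16
probs = F.softmax(paddle.zeros([reg_max + 1]), axis=0)  # uniform dist
print((probs * proj).sum())  # 8.0, the expected value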
def forward_train(self, feats, targets):
anchors, anchor_points, num_anchors_list, stride_tensor = \
generate_anchors_for_grid_cell(
feats, self.fpn_strides, self.grid_cell_scale,
self.grid_cell_offset)
cls_score_list, reg_distri_list = [], []
for i, feat in enumerate(feats):
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))
cls_score_list = paddle.concat(cls_score_list, axis=1)
reg_distri_list = paddle.concat(reg_distri_list, axis=1)
return self.get_loss([
cls_score_list, reg_distri_list, anchors, anchor_points,
num_anchors_list, stride_tensor
], targets)
def _generate_anchors(self, feats=None, dtype='float32'):
# only used at eval time
anchor_points = []
stride_tensor = []
for i, stride in enumerate(self.fpn_strides):
if feats is not None:
_, _, h, w = feats[i].shape
else:
h = int(self.eval_size[0] / stride)
w = int(self.eval_size[1] / stride)
shift_x = paddle.arange(end=w) + self.grid_cell_offset
shift_y = paddle.arange(end=h) + self.grid_cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype=dtype)
anchor_points.append(anchor_point.reshape([-1, 2]))
stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype))
anchor_points = paddle.concat(anchor_points)
stride_tensor = paddle.concat(stride_tensor)
return anchor_points, stride_tensor
def forward_eval(self, feats):
if self.eval_size:
anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
else:
anchor_points, stride_tensor = self._generate_anchors(feats)
cls_score_list, reg_dist_list = [], []
for i, feat in enumerate(feats):
_, _, h, w = feat.shape
l = h * w
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
reg_dist = reg_dist.reshape([-1, 4, self.reg_max + 1, l]).transpose(
[0, 2, 3, 1])
reg_dist = self.proj_conv(F.softmax(reg_dist, axis=1)).squeeze(1)
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.reshape([-1, self.num_classes, l]))
reg_dist_list.append(reg_dist)
cls_score_list = paddle.concat(cls_score_list, axis=-1)
reg_dist_list = paddle.concat(reg_dist_list, axis=1)
return cls_score_list, reg_dist_list, anchor_points, stride_tensor
def forward(self, feats, targets=None):
assert len(feats) == len(self.fpn_strides), \
"The size of feats is not equal to size of fpn_strides"
if self.training:
return self.forward_train(feats, targets)
else:
return self.forward_eval(feats)
@staticmethod
def _focal_loss(score, label, alpha=0.25, gamma=2.0):
weight = (score - label).pow(gamma)
if alpha > 0:
alpha_t = alpha * label + (1 - alpha) * (1 - label)
weight *= alpha_t
loss = F.binary_cross_entropy(
score, label, weight=weight, reduction='sum')
return loss
@staticmethod
def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
loss = F.binary_cross_entropy(
pred_score, gt_score, weight=weight, reduction='sum')
return loss
def _bbox_decode(self, anchor_points, pred_dist):
_, l, _ = get_static_shape(pred_dist)
pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_max + 1]))
pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1)
return batch_distance2bbox(anchor_points, pred_dist)
def _bbox2distance(self, points, bbox):
x1y1, x2y2 = paddle.split(bbox, 2, -1)
lt = points - x1y1
rb = x2y2 - points
return paddle.concat([lt, rb], -1).clip(0, self.reg_max - 0.01)
def _df_loss(self, pred_dist, target):
target_left = paddle.cast(target, 'int64')
target_right = target_left + 1
weight_left = target_right.astype('float32') - target
weight_right = 1 - weight_left
loss_left = F.cross_entropy(
pred_dist, target_left, reduction='none') * weight_left
loss_right = F.cross_entropy(
pred_dist, target_right, reduction='none') * weight_right
return (loss_left + loss_right).mean(-1, keepdim=True)
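_df_loss spreads each continuous target over its two neighbouring integer bins; a standalone check of the weighting for a target of 2.3 (about 70% on bin 2, 30% on bin 3):

import paddle

target = paddle.to_tensor([2.3])
target_left = paddle.cast(target, 'int64')              # bin 2
target_right = target_left + 1                          # bin 3
weight_left = target_right.astype('float32') - target   # ~0.7
weight_right = 1 - weight_left                          # ~0.3
print(target_left.item(), weight_left.item(), weight_right.item())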
def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels,
assigned_bboxes, assigned_scores, assigned_scores_sum):
# select positive samples mask
mask_positive = (assigned_labels != self.num_classes)
num_pos = mask_positive.sum()
# pos/neg loss
if num_pos > 0:
# l1 + iou
bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])
pred_bboxes_pos = paddle.masked_select(pred_bboxes,
bbox_mask).reshape([-1, 4])
assigned_bboxes_pos = paddle.masked_select(
assigned_bboxes, bbox_mask).reshape([-1, 4])
bbox_weight = paddle.masked_select(
assigned_scores.sum(-1), mask_positive).unsqueeze(-1)
loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)
loss_iou = self.iou_loss(pred_bboxes_pos,
assigned_bboxes_pos) * bbox_weight
loss_iou = loss_iou.sum() / assigned_scores_sum
dist_mask = mask_positive.unsqueeze(-1).tile(
[1, 1, (self.reg_max + 1) * 4])
pred_dist_pos = paddle.masked_select(
pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1])
assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes)
assigned_ltrb_pos = paddle.masked_select(
assigned_ltrb, bbox_mask).reshape([-1, 4])
loss_dfl = self._df_loss(pred_dist_pos,
assigned_ltrb_pos) * bbox_weight
loss_dfl = loss_dfl.sum() / assigned_scores_sum
else:
loss_l1 = paddle.zeros([1])
loss_iou = paddle.zeros([1])
loss_dfl = pred_dist.sum() * 0.
return loss_l1, loss_iou, loss_dfl
def get_loss(self, head_outs, gt_meta):
pred_scores, pred_distri, anchors,\
anchor_points, num_anchors_list, stride_tensor = head_outs
anchor_points_s = anchor_points / stride_tensor
pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)
gt_labels = gt_meta['gt_class']
gt_bboxes = gt_meta['gt_bbox']
pad_gt_mask = gt_meta['pad_gt_mask']
# label assignment
if gt_meta['epoch_id'] < self.static_assigner_epoch:
assigned_labels, assigned_bboxes, assigned_scores = \
self.static_assigner(
anchors,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
pred_bboxes=pred_bboxes.detach() * stride_tensor)
alpha_l = 0.25
else:
assigned_labels, assigned_bboxes, assigned_scores = \
self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
alpha_l = -1
# rescale bbox
assigned_bboxes /= stride_tensor
# cls loss
if self.use_varifocal_loss:
one_hot_label = F.one_hot(assigned_labels,
self.num_classes + 1)[..., :-1]
loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
one_hot_label)
else:
loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)
assigned_scores_sum = assigned_scores.sum()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(assigned_scores_sum)
assigned_scores_sum /= paddle.distributed.get_world_size()
assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
loss_cls /= assigned_scores_sum
loss_l1, loss_iou, loss_dfl = \
self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s,
assigned_labels, assigned_bboxes, assigned_scores,
assigned_scores_sum)
loss = self.loss_weight['class'] * loss_cls + \
self.loss_weight['iou'] * loss_iou + \
self.loss_weight['dfl'] * loss_dfl
out_dict = {
'loss': loss,
'loss_cls': loss_cls,
'loss_iou': loss_iou,
'loss_dfl': loss_dfl,
'loss_l1': loss_l1,
}
return out_dict
def post_process(self, head_outs, scale_factor):
pred_scores, pred_dist, anchor_points, stride_tensor = head_outs
pred_bboxes = batch_distance2bbox(anchor_points, pred_dist)
pred_bboxes *= stride_tensor
if self.exclude_post_process:
return paddle.concat(
[pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None
else:
# scale bbox to origin
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[scale_x, scale_y, scale_x, scale_y],
axis=-1).reshape([-1, 1, 4])
pred_bboxes /= scale_factor
if self.exclude_nms:
# `exclude_nms=True` just use in benchmark
return pred_bboxes, pred_scores
else:
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num

@ -0,0 +1,249 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant
from paddlers.models.ppdet.modeling.bbox_utils import bbox2delta, delta2bbox
from paddlers.models.ppdet.modeling.heads.fcos_head import FCOSFeat
from paddlers.models.ppdet.core.workspace import register
__all__ = ['RetinaHead']
@register
class RetinaFeat(FCOSFeat):
"""We use FCOSFeat to construct conv layers in RetinaNet.
We rename FCOSFeat to RetinaFeat to avoid confusion.
"""
pass
@register
class RetinaHead(nn.Layer):
"""Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf
"""
__shared__ = ['num_classes']
__inject__ = [
'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class',
'loss_bbox', 'nms'
]
def __init__(self,
num_classes=80,
conv_feat='RetinaFeat',
anchor_generator='RetinaAnchorGenerator',
bbox_assigner='MaxIoUAssigner',
loss_class='FocalLoss',
loss_bbox='SmoothL1Loss',
nms='MultiClassNMS',
prior_prob=0.01,
nms_pre=1000,
weights=[1., 1., 1., 1.]):
super(RetinaHead, self).__init__()
self.num_classes = num_classes
self.conv_feat = conv_feat
self.anchor_generator = anchor_generator
self.bbox_assigner = bbox_assigner
self.loss_class = loss_class
self.loss_bbox = loss_bbox
self.nms = nms
self.nms_pre = nms_pre
self.weights = weights
bias_init_value = -math.log((1 - prior_prob) / prior_prob)
num_anchors = self.anchor_generator.num_anchors
self.retina_cls = nn.Conv2D(
in_channels=self.conv_feat.feat_out,
out_channels=self.num_classes * num_anchors,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=bias_init_value)))
self.retina_reg = nn.Conv2D(
in_channels=self.conv_feat.feat_out,
out_channels=4 * num_anchors,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0)))
def forward(self, neck_feats, targets=None):
cls_logits_list = []
bboxes_reg_list = []
for neck_feat in neck_feats:
conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat)
cls_logits = self.retina_cls(conv_cls_feat)
bbox_reg = self.retina_reg(conv_reg_feat)
cls_logits_list.append(cls_logits)
bboxes_reg_list.append(bbox_reg)
if self.training:
return self.get_loss([cls_logits_list, bboxes_reg_list], targets)
else:
return [cls_logits_list, bboxes_reg_list]
def get_loss(self, head_outputs, targets):
"""Here we calculate loss for a batch of images.
We assign anchors to gts in each image and gather all the assigned
positive and negative samples. Then the loss is calculated on the gathered
samples.
"""
cls_logits_list, bboxes_reg_list = head_outputs
anchors = self.anchor_generator(cls_logits_list)
anchors = paddle.concat(anchors)
# matches: contain gt_inds
# match_labels: -1(ignore), 0(neg) or 1(pos)
matches_list, match_labels_list = [], []
# assign anchors to gts, no sampling is involved
for gt_bbox in targets['gt_bbox']:
matches, match_labels = self.bbox_assigner(anchors, gt_bbox)
matches_list.append(matches)
match_labels_list.append(match_labels)
# reshape network outputs
cls_logits = [
_.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes])
for _ in cls_logits_list
]
bboxes_reg = [
_.transpose([0, 2, 3, 1]).reshape([0, -1, 4])
for _ in bboxes_reg_list
]
cls_logits = paddle.concat(cls_logits, axis=1)
bboxes_reg = paddle.concat(bboxes_reg, axis=1)
cls_pred_list, cls_tar_list = [], []
reg_pred_list, reg_tar_list = [], []
# find and gather preds and targets in each image
for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \
zip(matches_list, match_labels_list, cls_logits, bboxes_reg,
targets['gt_bbox'], targets['gt_class']):
pos_mask = (match_labels == 1)
neg_mask = (match_labels == 0)
chosen_mask = paddle.logical_or(pos_mask, neg_mask)
gt_class = gt_class.reshape([-1])
bg_class = paddle.to_tensor(
[self.num_classes], dtype=gt_class.dtype)
# a trick to assign num_classes to negative targets
gt_class = paddle.concat([gt_class, bg_class], axis=-1)
matches = paddle.where(neg_mask,
paddle.full_like(matches, gt_class.size - 1),
matches)
cls_pred = cls_logit[chosen_mask]
cls_tar = gt_class[matches[chosen_mask]]
reg_pred = bbox_reg[pos_mask].reshape([-1, 4])
reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4])
reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights)
cls_pred_list.append(cls_pred)
cls_tar_list.append(cls_tar)
reg_pred_list.append(reg_pred)
reg_tar_list.append(reg_tar)
cls_pred = paddle.concat(cls_pred_list)
cls_tar = paddle.concat(cls_tar_list)
reg_pred = paddle.concat(reg_pred_list)
reg_tar = paddle.concat(reg_tar_list)
avg_factor = max(1.0, reg_pred.shape[0])
cls_loss = self.loss_class(
cls_pred, cls_tar, reduction='sum') / avg_factor
if reg_pred.shape[0] == 0:
reg_loss = paddle.zeros([1])
reg_loss.stop_gradient = False
else:
reg_loss = self.loss_bbox(
reg_pred, reg_tar, reduction='sum') / avg_factor
loss = cls_loss + reg_loss
out_dict = {
'loss_cls': cls_loss,
'loss_reg': reg_loss,
'loss': loss,
}
return out_dict
def get_bboxes_single(self,
anchors,
cls_scores_list,
bbox_preds_list,
im_shape,
scale_factor,
rescale=True):
assert len(cls_scores_list) == len(bbox_preds_list)
mlvl_bboxes = []
mlvl_scores = []
for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list,
bbox_preds_list):
cls_score = cls_score.reshape([-1, self.num_classes])
bbox_pred = bbox_pred.reshape([-1, 4])
if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:
max_score = cls_score.max(axis=1)
_, topk_inds = max_score.topk(self.nms_pre)
bbox_pred = bbox_pred.gather(topk_inds)
anchor = anchor.gather(topk_inds)
cls_score = cls_score.gather(topk_inds)
bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze()
mlvl_bboxes.append(bbox_pred)
mlvl_scores.append(F.sigmoid(cls_score))
mlvl_bboxes = paddle.concat(mlvl_bboxes)
mlvl_bboxes = paddle.squeeze(mlvl_bboxes)
if rescale:
mlvl_bboxes = mlvl_bboxes / paddle.concat(
[scale_factor[::-1], scale_factor[::-1]])
mlvl_scores = paddle.concat(mlvl_scores)
mlvl_scores = mlvl_scores.transpose([1, 0])
return mlvl_bboxes, mlvl_scores
def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor):
batch_bboxes = []
batch_scores = []
for img_id in range(cls_logits[0].shape[0]):
num_lvls = len(cls_logits)
cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)]
bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)]
bboxes, scores = self.get_bboxes_single(
anchors, cls_scores_list, bbox_preds_list, im_shape[img_id],
scale_factor[img_id])
batch_bboxes.append(bboxes)
batch_scores.append(scores)
batch_bboxes = paddle.stack(batch_bboxes, axis=0)
batch_scores = paddle.stack(batch_scores, axis=0)
return batch_bboxes, batch_scores
def post_process(self, head_outputs, im_shape, scale_factor):
cls_logits_list, bboxes_reg_list = head_outputs
anchors = self.anchor_generator(cls_logits_list)
cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list]
bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list]
bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape,
scale_factor)
bbox_pred, bbox_num, _ = self.nms(bboxes, scores)
return bbox_pred, bbox_num

@ -29,7 +29,7 @@ class RoIAlign(object):
RoI Align module
For more details, please refer to the document of roi_align in
in ppdet/modeling/ops.py
in https://github.com/PaddlePaddle/Paddle/blob/release/2.5/python/paddle/vision/ops.py
Args:
resolution (int): The output size, default 14
@ -76,17 +76,26 @@ class RoIAlign(object):
def __call__(self, feats, roi, rois_num):
roi = paddle.concat(roi) if len(roi) > 1 else roi[0]
if len(feats) == 1:
rois_feat = ops.roi_align(
feats[self.start_level],
roi,
self.resolution,
self.spatial_scale[0],
rois_num=rois_num,
rois_feat = paddle.vision.ops.roi_align(
x=feats[self.start_level],
boxes=roi,
boxes_num=rois_num,
output_size=self.resolution,
spatial_scale=self.spatial_scale[0],
aligned=self.aligned)
else:
offset = 2
k_min = self.start_level + offset
k_max = self.end_level + offset
if hasattr(paddle.vision.ops, "distribute_fpn_proposals"):
rois_dist, restore_index, rois_num_dist = paddle.vision.ops.distribute_fpn_proposals(
roi,
k_min,
k_max,
self.canconical_level,
self.canonical_size,
rois_num=rois_num)
else:
rois_dist, restore_index, rois_num_dist = ops.distribute_fpn_proposals(
roi,
k_min,
@ -94,15 +103,16 @@ class RoIAlign(object):
self.canconical_level,
self.canonical_size,
rois_num=rois_num)
rois_feat_list = []
for lvl in range(self.start_level, self.end_level + 1):
roi_feat = ops.roi_align(
feats[lvl],
rois_dist[lvl],
self.resolution,
self.spatial_scale[lvl],
roi_feat = paddle.vision.ops.roi_align(
x=feats[lvl],
boxes=rois_dist[lvl],
boxes_num=rois_num_dist[lvl],
output_size=self.resolution,
spatial_scale=self.spatial_scale[lvl],
sampling_ratio=self.sampling_ratio,
rois_num=rois_num_dist[lvl],
aligned=self.aligned)
rois_feat_list.append(roi_feat)
rois_feat_shuffle = paddle.concat(rois_feat_list)
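A standalone sketch of the migrated call (assuming a Paddle version where paddle.vision.ops.roi_align takes the boxes/boxes_num keyword arguments, as the hunk above does): one RoI pooled from a single feature map.

import paddle

feat = paddle.rand([1, 256, 32, 32])             # N, C, H, W
roi = paddle.to_tensor([[4., 4., 28., 28.]])     # x1, y1, x2, y2
rois_num = paddle.to_tensor([1], dtype='int32')  # boxes per image
out = paddle.vision.ops.roi_align(
    x=feat, boxes=roi, boxes_num=rois_num,
    output_size=14, spatial_scale=1.0, aligned=True)
print(out.shape)  # [1, 256, 14, 14]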
