[Fix] Update ppdet Version and Update QR Code (#64)
parent
6752db2de9
commit
a4957b21be
197 changed files with 14438 additions and 4071 deletions
Before Width: | Height: | Size: 280 KiB After Width: | Height: | Size: 281 KiB |
@ -1 +1,2 @@ |
||||
ppdet ba2aad26e6bc1e5c2dad76ca96692a0d63eccfac |
||||
ppseg f6c73b478cdf00f40ae69edd35bf6bce2a1687ef |
@ -0,0 +1,479 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import print_function |
||||
from __future__ import division |
||||
|
||||
try: |
||||
from collections.abc import Sequence |
||||
except Exception: |
||||
from collections import Sequence |
||||
|
||||
from numbers import Number, Integral |
||||
|
||||
import cv2 |
||||
import numpy as np |
||||
import math |
||||
import copy |
||||
|
||||
from .operators import register_op, BaseOperator |
||||
from paddlers.models.ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np |
||||
from paddlers.models.ppdet.utils.logger import setup_logger |
||||
logger = setup_logger(__name__) |
||||
|
||||
|
||||
@register_op
class RRotate(BaseOperator):
    """Rotate an image together with its polygon and box annotations.

    Args:
        scale (float): rotate scale
        angle (float): rotate angle
        fill_value (int, tuple): fill color used for pixels outside the source
        auto_bound (bool): whether auto bound or not. When True the output
            keeps the input size and the rotation is shrunk to fit; when
            False the output canvas grows to hold the rotated image.
    """

    def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True):
        super(RRotate, self).__init__()
        self.scale, self.angle = scale, angle
        self.fill_value = fill_value
        self.auto_bound = auto_bound

    def get_rotated_matrix(self, angle, scale, h, w):
        """Build the 2x3 affine matrix and the output size for a rotation.

        Returns:
            tuple: ``(matrix, out_h, out_w)``.
        """
        pivot = ((w - 1) * 0.5, (h - 1) * 0.5)
        matrix = cv2.getRotationMatrix2D(pivot, -angle, scale)

        # Size of the axis-aligned box that encloses the rotated image.
        abs_cos = np.abs(matrix[0, 0])
        abs_sin = np.abs(matrix[0, 1])
        new_w = h * abs_sin + w * abs_cos
        new_h = h * abs_cos + w * abs_sin
        n_w = int(np.round(new_w))
        n_h = int(np.round(new_h))

        if self.auto_bound:
            # Keep the canvas size and rebuild the matrix with a ratio that
            # makes the rotated content fit; `scale` is not used here.
            ratio = min(w / n_w, h / n_h)
            return cv2.getRotationMatrix2D(pivot, -angle, ratio), h, w

        # Grow the canvas and shift the content so it stays centred.
        matrix[0, 2] += (new_w - w) * 0.5
        matrix[1, 2] += (new_h - h) * 0.5
        return matrix, n_h, n_w

    def get_rect_from_pts(self, pts, h, w):
        """Return per-row ``[xmin, ymin, xmax, ymax]`` clipped to ``w`` / ``h``."""
        assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
        xs = pts[:, 0::2]
        ys = pts[:, 1::2]
        x_lo = np.clip(np.min(xs, axis=1), 0, w)
        y_lo = np.clip(np.min(ys, axis=1), 0, h)
        x_hi = np.clip(np.max(xs, axis=1), 0, w)
        y_hi = np.clip(np.max(ys, axis=1), 0, h)
        return np.stack([x_lo, y_lo, x_hi, y_hi], axis=-1)

    def apply_image(self, image, matrix, h, w):
        """Warp ``image`` with ``matrix`` onto a ``(w, h)`` canvas."""
        out_size = (w, h)
        return cv2.warpAffine(
            image, matrix, out_size, borderValue=self.fill_value)

    def apply_pts(self, pts, matrix, h, w):
        """Apply the affine ``matrix`` to rows of flattened (x, y) pairs."""
        assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
        # Each row holds m values, i.e. m / 2 points.
        row_len = pts.shape[1]
        xy = pts.reshape(-1, 2).T
        # Append a row of ones to get homogeneous coordinates.
        ones = np.ones((1, xy.shape[1]), pts.dtype)
        homogeneous = np.concatenate((xy, ones), axis=0)
        moved = np.matmul(matrix, homogeneous)
        return moved[:2, :].T.reshape(-1, row_len)

    def apply(self, sample, context=None):
        """Rotate ``sample['image']`` and, if present, its polygons/boxes."""
        image = sample['image']
        src_h, src_w = image.shape[:2]
        matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, src_h,
                                               src_w)
        sample['image'] = self.apply_image(image, matrix, h, w)

        polys = sample['gt_poly']
        # TODO: segment or keypoint to be processed
        if len(polys) == 0:
            return sample

        rotated = self.apply_pts(polys, matrix, h, w)
        sample['gt_poly'] = rotated
        sample['gt_bbox'] = self.get_rect_from_pts(rotated, h, w)
        return sample
||||
|
||||
|
||||
@register_op
class RandomRRotate(BaseOperator):
    """Rotate a sample by a randomly drawn angle and scale.

    Args:
        scale (float, tuple, list): rotate scale
        scale_mode (str): mode of scale, [range, value, None]
        angle (float, tuple, list): rotate angle
        angle_mode (str): mode of angle, [range, value, None]
        fill_value (float, tuple, list): fill value
        rotate_prob (float): probability of rotation
        auto_bound (bool): whether auto bound or not
    """

    def __init__(self,
                 scale=1.0,
                 scale_mode=None,
                 angle=0.,
                 angle_mode=None,
                 fill_value=0.,
                 rotate_prob=1.0,
                 auto_bound=True):
        super(RandomRRotate, self).__init__()
        self.scale, self.scale_mode = scale, scale_mode
        self.angle, self.angle_mode = angle, angle_mode
        self.fill_value = fill_value
        self.rotate_prob = rotate_prob
        self.auto_bound = auto_bound

    @staticmethod
    def _draw(value, mode, message):
        """Pick a value according to ``mode``.

        A falsy mode returns ``value`` as is, 'range' draws uniformly from
        ``(low, high) = value`` and 'value' picks one element of ``value``.
        """
        assert not mode or mode in ['range', 'value'], message
        if not mode:
            return value
        if mode == 'range':
            low, high = value
            return np.random.rand() * (high - low) + low
        if mode == 'value':
            return np.random.choice(value)
        return None

    def get_angle(self, angle, angle_mode):
        """Return the rotation angle selected by ``angle_mode``."""
        return self._draw(angle, angle_mode,
                          'angle mode should be in [range, value, None]')

    def get_scale(self, scale, scale_mode):
        """Return the rotation scale selected by ``scale_mode``."""
        return self._draw(scale, scale_mode,
                          'scale mode should be in [range, value, None]')

    def apply(self, sample, context=None):
        """Rotate ``sample`` with probability ``rotate_prob``."""
        skip = np.random.rand() > self.rotate_prob
        if skip:
            return sample

        # The angle is drawn before the scale.
        picked_angle = self.get_angle(self.angle, self.angle_mode)
        picked_scale = self.get_scale(self.scale, self.scale_mode)
        return RRotate(picked_scale, picked_angle, self.fill_value,
                       self.auto_bound)(sample)
||||
|
||||
|
||||
@register_op
class Poly2RBox(BaseOperator):
    """ Polygon to Rotated Box, using new OpenCV definition since 4.5.1

    Args:
        filter_threshold (int, float): threshold to filter annotations
        filter_mode (str): filter mode, ['area', 'edge']
        rbox_type (str): rbox type, ['le135', 'oc']; any value other than
            'le135' selects the 'oc' conversion.
    """

    def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'):
        super(Poly2RBox, self).__init__()

        def _too_small(size):
            return self.filter(size, filter_threshold, filter_mode)

        self.filter_fn = _too_small
        if rbox_type == 'le135':
            self.rbox_fn = poly2rbox_le135_np
        else:
            self.rbox_fn = poly2rbox_oc_np

    def filter(self, size, threshold, mode):
        """Return True when a box of ``size`` (w, h) should be dropped."""
        if mode == 'area':
            return bool(size[0] * size[1] < threshold)
        if mode == 'edge':
            return bool(min(size) < threshold)
        # Any other mode keeps every box.
        return False

    def get_rbox(self, polys):
        """Convert polygons to rotated boxes, skipping filtered ones.

        Returns:
            tuple: ``(rboxes, bboxes, valid_ids)`` where ``rboxes`` is (K, 5)
            float32, ``bboxes`` is (K, 4) float32 and ``valid_ids`` lists the
            indices of the polygons that were kept.
        """
        kept_ids = []
        rbox_list = []
        bbox_list = []
        for idx, poly in enumerate(polys):
            cx, cy, w, h, angle = self.rbox_fn(poly)
            if self.filter_fn((w, h)):
                continue
            rbox_list.append(
                np.array(
                    [cx, cy, w, h, angle], dtype=np.float32))
            kept_ids.append(idx)
            # Axis-aligned box enclosing the polygon.
            xs, ys = poly[0::2], poly[1::2]
            bbox_list.append(
                np.array(
                    [min(xs), min(ys), max(xs), max(ys)], dtype=np.float32))

        if kept_ids:
            return np.stack(rbox_list), np.stack(bbox_list), kept_ids

        empty_rboxes = np.zeros((0, 5), dtype=np.float32)
        empty_bboxes = np.zeros((0, 4), dtype=np.float32)
        return empty_rboxes, empty_bboxes, kept_ids

    def apply(self, sample, context=None):
        """Write `gt_rbox` / `gt_bbox` and filter the per-object fields."""
        rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly'])
        sample['gt_rbox'] = rboxes
        sample['gt_bbox'] = bboxes
        per_object_keys = [
            'gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult'
        ]
        for key in per_object_keys:
            if key not in sample:
                continue
            sample[key] = sample[key][valid_ids]

        return sample
||||
|
||||
|
||||
@register_op
class Poly2Array(BaseOperator):
    """Convert `gt_poly` to a float32 array with 8 values per row."""

    def __init__(self):
        super(Poly2Array, self).__init__()

    def apply(self, sample, context=None):
        """Reshape ``sample['gt_poly']`` to (-1, 8) when the key exists."""
        if 'gt_poly' not in sample:
            return sample

        polys = np.array(sample['gt_poly'], dtype=np.float32)
        sample['gt_poly'] = polys.reshape((-1, 8))
        return sample
||||
|
||||
|
||||
@register_op
class RResize(BaseOperator):
    """Resize an image and scale its box / polygon annotations with it.

    If ``keep_ratio`` is True, one common scale factor is used so that the
    image fits inside ``target_size``; otherwise the image is resized to
    ``target_size`` given as (h, w).

    Args:
        target_size (int|list): image target size
        keep_ratio (bool): whether keep_ratio or not
        interp (int): the interpolation method

    Raises:
        TypeError: if ``target_size`` is neither an integer nor a sequence.
    """

    def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
        super(RResize, self).__init__()
        self.keep_ratio = keep_ratio
        self.interp = interp
        if not isinstance(target_size, (Integral, Sequence)):
            raise TypeError(
                "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
                format(type(target_size)))
        if isinstance(target_size, Integral):
            target_size = [target_size, target_size]
        self.target_size = target_size

    def apply_image(self, image, scale):
        """Resize ``image`` by the ``(scale_x, scale_y)`` factors."""
        im_scale_x, im_scale_y = scale

        return cv2.resize(
            image,
            None,
            None,
            fx=im_scale_x,
            fy=im_scale_y,
            interpolation=self.interp)

    def apply_pts(self, pts, scale, size):
        """Scale flattened (x, y) rows in place and clip them to ``size``.

        ``pts`` is modified in place and also returned.
        """
        im_scale_x, im_scale_y = scale
        resize_w, resize_h = size
        pts[:, 0::2] *= im_scale_x
        pts[:, 1::2] *= im_scale_y
        pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w)
        pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h)
        return pts

    def apply(self, sample, context=None):
        """Resize the image and update `im_shape`, `scale_factor`, boxes, polys.

        Raises:
            TypeError: if ``sample['image']`` is not a numpy array.
            ValueError: if the image is not 3-dimensional.
        """
        im = sample['image']
        if not isinstance(im, np.ndarray):
            raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            # The original raised `ImageError`, which is neither defined nor
            # imported in this module, so the check ended in a NameError.
            raise ValueError('{}: image is not 3-dimensional.'.format(self))

        # apply image
        im_shape = im.shape
        if self.keep_ratio:
            im_size_min = np.min(im_shape[0:2])
            im_size_max = np.max(im_shape[0:2])

            target_size_min = np.min(self.target_size)
            target_size_max = np.max(self.target_size)

            # Largest scale for which both the short and the long side fit.
            im_scale = min(target_size_min / im_size_min,
                           target_size_max / im_size_max)

            resize_h = im_scale * float(im_shape[0])
            resize_w = im_scale * float(im_shape[1])

            im_scale_x = im_scale
            im_scale_y = im_scale
        else:
            resize_h, resize_w = self.target_size
            im_scale_y = resize_h / im_shape[0]
            im_scale_x = resize_w / im_shape[1]

        im = self.apply_image(im, [im_scale_x, im_scale_y])
        sample['image'] = im.astype(np.float32)
        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
        if 'scale_factor' in sample:
            # Accumulate with a factor set by an earlier operator.
            scale_factor = sample['scale_factor']
            sample['scale_factor'] = np.asarray(
                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
                dtype=np.float32)
        else:
            sample['scale_factor'] = np.asarray(
                [im_scale_y, im_scale_x], dtype=np.float32)

        # apply bbox and polygon
        for key in ('gt_bbox', 'gt_poly'):
            if key in sample and len(sample[key]) > 0:
                sample[key] = self.apply_pts(sample[key],
                                             [im_scale_x, im_scale_y],
                                             [resize_w, resize_h])

        return sample
||||
|
||||
|
||||
@register_op
class RandomRFlip(BaseOperator):
    """Randomly flip the image and its annotations horizontally.

    Args:
        prob (float): the probability of flipping image. Integer values
            such as 0 or 1 are accepted as well.

    Raises:
        TypeError: if ``prob`` is not a float or an integer.
    """

    def __init__(self, prob=0.5):
        super(RandomRFlip, self).__init__()
        self.prob = prob
        # Accept integers too: a config value like `prob: 1` is not a float.
        if not isinstance(self.prob, (float, Integral)):
            raise TypeError("{}: input type is invalid.".format(self))

    def apply_image(self, image):
        """Mirror a 3-D image along its second (width) axis."""
        return image[:, ::-1, :]

    def apply_pts(self, pts, width):
        """Mirror the x coordinates of flattened (x, y) rows in place."""
        oldx = pts[:, 0::2].copy()
        pts[:, 0::2] = width - oldx - 1
        return pts

    def apply(self, sample, context=None):
        """Flip the image, `gt_bbox` and `gt_poly` with probability ``prob``.

        When a flip happens, ``sample['flipped']`` is set to True and
        ``sample['image']`` is replaced by the mirrored image.
        """
        if np.random.uniform(0, 1) < self.prob:
            im = sample['image']
            width = im.shape[1]
            im = self.apply_image(im)
            # NOTE(review): for [xmin, ymin, xmax, ymax] boxes this mirrors
            # both x values without swapping them, so xmin > xmax afterwards
            # - confirm that a later operator recomputes gt_bbox.
            for key in ('gt_bbox', 'gt_poly'):
                if key in sample and len(sample[key]) > 0:
                    sample[key] = self.apply_pts(sample[key], width)

            sample['flipped'] = True
            sample['image'] = im
        return sample
||||
|
||||
|
||||
@register_op
class VisibleRBox(BaseOperator):
    """
    In debug mode, visualize images according to `gt_poly`.
    (Currently only supported when not cropping and flipping image.)

    Args:
        output_dir (str): directory the rendered images are saved to; it is
            created when missing.
        is_normalized (bool): whether `gt_keypoint` coordinates are
            normalized and must be multiplied by ``sample['w']`` /
            ``sample['h']`` before drawing.
    """

    def __init__(self, output_dir='debug', is_normalized=False):
        # `os` is not imported at module level, so import it locally.
        import os

        super(VisibleRBox, self).__init__()
        self.output_dir = output_dir
        # The original read `self.is_normalized` without ever setting it.
        self.is_normalized = is_normalized
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

    @staticmethod
    def _text_size(draw, text):
        """Return (width, height) of ``text`` on any Pillow version."""
        if hasattr(draw, 'textbbox'):
            # `ImageDraw.textsize` was removed in Pillow 10.
            left, top, right, bottom = draw.textbbox((0, 0), text)
            return right - left, bottom - top
        return draw.textsize(text)

    def apply(self, sample, context=None):
        """Draw polygons, class labels and keypoints, then save a JPEG.

        The sample is returned unchanged.
        """
        # Pillow and `os` are not imported at module level in this file.
        import os
        from PIL import Image, ImageDraw

        image = Image.fromarray(sample['image'].astype(np.uint8))
        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
        gt_poly = sample['gt_poly']
        gt_class = sample['gt_class']
        draw = ImageDraw.Draw(image)
        for i in range(gt_poly.shape[0]):
            x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i]
            draw.line(
                [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
                width=2,
                fill='green')
            # draw label
            xmin = min(x1, x2, x3, x4)
            ymin = min(y1, y2, y3, y4)
            text = str(gt_class[i][0])
            tw, th = self._text_size(draw, text)
            draw.rectangle(
                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))

        if 'gt_keypoint' in sample:
            # Work on a copy so this debug operator does not alter the
            # keypoints stored in the sample.
            gt_keypoint = np.array(sample['gt_keypoint'], dtype=np.float32)
            if self.is_normalized:
                gt_keypoint[:, 0::2] *= sample['w']
                gt_keypoint[:, 1::2] *= sample['h']
            for keypoint in gt_keypoint:
                for j in range(keypoint.shape[0] // 2):
                    # Built-in round() returns a plain int for numpy scalars,
                    # so convert explicitly instead of calling `.astype`.
                    px = int(round(float(keypoint[2 * j])))
                    py = int(round(float(keypoint[2 * j + 1])))
                    draw.ellipse(
                        (px, py, px + 5, py + 5),
                        fill='green',
                        outline='green')
        save_path = os.path.join(self.output_dir, out_file_name)
        image.save(save_path, quality=95)
        return sample
||||
|
||||
|
||||
@register_op
class Rbox2Poly(BaseOperator):
    """
    Convert rbbox format to poly format.

    Reads ``sample['gt_rbox']`` (N rows of 5 values), writes the polygons
    returned by ``rbox2poly_np`` to ``sample['gt_poly']`` and their
    axis-aligned bounds ``[xmin, ymin, xmax, ymax]`` to ``sample['gt_bbox']``.
    """

    def __init__(self):
        super(Rbox2Poly, self).__init__()

    def apply(self, sample, context=None):
        assert 'gt_rbox' in sample
        assert sample['gt_rbox'].shape[1] == 5
        rboxes = sample['gt_rbox']
        polys = rbox2poly_np(rboxes)
        sample['gt_poly'] = polys
        xs, ys = polys[:, 0::2], polys[:, 1::2]
        xmin, ymin = xs.min(1), ys.min(1)
        xmax, ymax = xs.max(1), ys.max(1)
        # The original stacked (xmin, ymin) twice, which produced
        # zero-area boxes; the last two columns must be the maxima.
        sample['gt_bbox'] = np.stack([xmin, ymin, xmax, ymax], axis=1)
        return sample
@ -0,0 +1,72 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import paddle |
||||
import numbers |
||||
import numpy as np |
||||
|
||||
try: |
||||
from collections.abc import Sequence, Mapping |
||||
except: |
||||
from collections import Sequence, Mapping |
||||
|
||||
|
||||
def default_collate_fn(batch):
    """Collate a list of samples into one batch, recursing into containers.

    The type of the first sample decides how the whole batch is merged:

    * ``numpy.ndarray``: stacked along a new leading axis.
    * number: packed into a ``numpy.ndarray``.
    * ``str`` / ``bytes``: the input sequence is returned unchanged.
    * mapping: a dict over the keys of the first sample, each value
      collated recursively.
    * any other sequence: samples are zipped field by field and each field
      is collated recursively into a list.

    e.g. for following input data:

        [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
         {'image': np.array(shape=[3, 224, 224]), 'label': 3},
         {'image': np.array(shape=[3, 224, 224]), 'label': 4},
         {'image': np.array(shape=[3, 224, 224]), 'label': 5},]

    the result is:

        {'image': np.array(shape=[4, 3, 224, 224]),
         'label': np.array([1, 3, 4, 5])}

    NOTE(review): tensor inputs are not handled by this implementation and
    end in the TypeError below - confirm whether that is intended.

    Args:
        batch(list of sample data): batch should be a list of sample data.

    Returns:
        Batched data: the collated batch as described above.

    Raises:
        RuntimeError: if sequence samples differ in length.
        TypeError: if the first sample has an unsupported type.
    """
    first = batch[0]
    if isinstance(first, np.ndarray):
        return np.stack(batch, axis=0)
    if isinstance(first, numbers.Number):
        return np.array(batch)
    if isinstance(first, (str, bytes)):
        return batch
    if isinstance(first, Mapping):
        return {
            key: default_collate_fn([item[key] for item in batch])
            for key in first
        }
    if isinstance(first, Sequence):
        fields_num = len(first)
        if any(len(item) != fields_num for item in batch):
            raise RuntimeError(
                "fields number not same among samples in a batch")
        return [default_collate_fn(fields) for fields in zip(*batch)]

    raise TypeError("batch data can only contain: numpy.ndarray, "
                    "dict, list, number, str, but got {}".format(type(first)))
@ -0,0 +1,35 @@ |
||||
# 自定义OP编译 |
||||
旋转框IOU计算OP是参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)实现的。 |
||||
|
||||
## 1. 环境依赖 |
||||
- Paddle >= 2.0.1 |
||||
- gcc 8.2 |
||||
|
||||
## 2. 安装 |
||||
``` |
||||
python setup.py install |
||||
``` |
||||
|
||||
编译完成后即可使用,以下为`rbox_iou`的使用示例 |
||||
``` |
||||
# 引入依赖及自定义op |
||||
import numpy as np |
||||
import paddle |
||||
from ext_op import rbox_iou |
||||
|
||||
paddle.set_device('gpu:0') |
||||
paddle.disable_static() |
||||
|
||||
rbox1 = np.random.rand(13000, 5) |
||||
rbox2 = np.random.rand(7, 5) |
||||
|
||||
pd_rbox1 = paddle.to_tensor(rbox1) |
||||
pd_rbox2 = paddle.to_tensor(rbox2) |
||||
|
||||
iou = rbox_iou(pd_rbox1, pd_rbox2) |
||||
print('iou', iou) |
||||
``` |
||||
|
||||
## 3. 单元测试 |
||||
可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示: |
||||
``` |
||||
python unittest/test_matched_rbox_iou.py |
||||
``` |
@ -0,0 +1,90 @@ |
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// The code is based on
|
||||
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
|
||||
|
||||
#include "paddle/extension.h" |
||||
#include "rbox_iou_op.h" |
||||
|
||||
template <typename T> |
||||
void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr, |
||||
const T *rbox2_data_ptr, T *output_data_ptr) { |
||||
|
||||
int i; |
||||
for (i = 0; i < rbox_num; i++) { |
||||
output_data_ptr[i] = |
||||
rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5); |
||||
} |
||||
} |
||||
|
||||
#define CHECK_INPUT_CPU(x) \ |
||||
PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") |
||||
|
||||
// CPU entry point: checks that both inputs are CPU tensors with the same
// number of rows, then fills a 1-D output with one IoU per row pair.
std::vector<paddle::Tensor>
MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
                         const paddle::Tensor &rbox2) {
  CHECK_INPUT_CPU(rbox1);
  CHECK_INPUT_CPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");

  auto rbox_num = rbox1.shape()[0];
  auto iou_out = paddle::Tensor(paddle::PlaceType::kCPU, {rbox_num});

  // Instantiate the kernel for the floating point type of rbox1.
  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "rotated_iou_cpu_kernel", ([&] {
        const data_t *boxes1 = rbox1.data<data_t>();
        const data_t *boxes2 = rbox2.data<data_t>();
        matched_rbox_iou_cpu_kernel<data_t>(rbox_num, boxes1, boxes2,
                                            iou_out.mutable_data<data_t>());
      }));

  return {iou_out};
}
||||
|
||||
#ifdef PADDLE_WITH_CUDA |
||||
std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1, |
||||
const paddle::Tensor &rbox2); |
||||
#endif |
||||
|
||||
// Fails when the two tensors do not live on the same place (device).
// The message previously contained the typos "smae pacle".
#define CHECK_INPUT_SAME(x1, x2)                                               \
  PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
||||
|
||||
// Dispatches matched_rbox_iou to the CPU or CUDA implementation according
// to the place of the inputs. Throws when the inputs live on different
// places or on a place that this build does not support.
std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
                                                  const paddle::Tensor &rbox2) {
  CHECK_INPUT_SAME(rbox1, rbox2);
  if (rbox1.place() == paddle::PlaceType::kCPU) {
    return MatchedRboxIouCPUForward(rbox1, rbox2);
  }
#ifdef PADDLE_WITH_CUDA
  if (rbox1.place() == paddle::PlaceType::kGPU) {
    return MatchedRboxIouCUDAForward(rbox1, rbox2);
  }
#endif
  // The original fell off the end of this non-void function here, which is
  // undefined behavior; report the unsupported place instead.
  PD_THROW("matched_rbox_iou: unsupported place, inputs must be CPU or GPU "
           "tensors supported by this build.");
}
||||
|
||||
// Output shape: one value per row of the first input. The second shape is
// not consulted.
std::vector<std::vector<int64_t>>
MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
                         std::vector<int64_t> rbox2_shape) {
  std::vector<int64_t> out_shape;
  out_shape.push_back(rbox1_shape[0]);

  std::vector<std::vector<int64_t>> shapes;
  shapes.push_back(out_shape);
  return shapes;
}
||||
|
||||
// Output dtype: same as the dtype of the first input; t2 is not consulted.
std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
                                                       paddle::DataType t2) {
  std::vector<paddle::DataType> dtypes;
  dtypes.push_back(t1);
  return dtypes;
}
||||
|
||||
PD_BUILD_OP(matched_rbox_iou) |
||||
.Inputs({"RBOX1", "RBOX2"}) |
||||
.Outputs({"Output"}) |
||||
.SetKernelFn(PD_KERNEL(MatchedRboxIouForward)) |
||||
.SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape)) |
||||
.SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype)); |
@ -0,0 +1,63 @@ |
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
// |
||||
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
// you may not use this file except in compliance with the License. |
||||
// You may obtain a copy of the License at |
||||
// |
||||
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
// |
||||
// Unless required by applicable law or agreed to in writing, software |
||||
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
// See the License for the specific language governing permissions and |
||||
// limitations under the License. |
||||
// |
||||
// The code is based on |
||||
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated |
||||
|
||||
#include "paddle/extension.h" |
||||
#include "rbox_iou_op.h" |
||||
|
||||
/** |
||||
Computes ceil(a / b) |
||||
*/ |
||||
|
||||
// Integer division of a by b, rounded up via the (a + b - 1) / b form.
static inline int CeilDiv(const int a, const int b) {
  const int padded = a + b - 1;
  return padded / b;
}
||||
|
||||
// Computes output[i] = IoU(rbox1[i], rbox2[i]) for i in [0, rbox_num).
// Each thread starts at its global index and advances by the total number
// of threads in the grid, so any grid size covers all boxes. Every box
// takes 5 consecutive values in its input buffer.
template <typename T>
__global__ void
matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
                             const T *rbox2_data_ptr, T *output_data_ptr) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  while (tid < rbox_num) {
    const T *box1 = rbox1_data_ptr + tid * 5;
    const T *box2 = rbox2_data_ptr + tid * 5;
    output_data_ptr[tid] = rbox_iou_single<T>(box1, box2);
    tid += blockDim.x * gridDim.x;
  }
}
||||
|
||||
#define CHECK_INPUT_GPU(x) \ |
||||
PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.") |
||||
|
||||
// CUDA entry point: checks that both inputs are GPU tensors with the same
// number of rows, then launches the matched IoU kernel on the stream of
// rbox1 and returns a 1-D output with one IoU per row pair.
std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
                          const paddle::Tensor &rbox2) {
  CHECK_INPUT_GPU(rbox1);
  CHECK_INPUT_GPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");

  auto rbox_num = rbox1.shape()[0];

  auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox_num});

  const int thread_per_block = 512;
  // shape() holds int64 values while the kernel takes int; make the
  // narrowing explicit.
  const int box_count = static_cast<int>(rbox_num);
  // A grid with 0 blocks is an invalid launch configuration, so launch at
  // least one block; the kernel loop does nothing when box_count is 0.
  const int needed_blocks = CeilDiv(box_count, thread_per_block);
  const int block_per_grid = needed_blocks > 0 ? needed_blocks : 1;

  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] {
        matched_rbox_iou_cuda_kernel<
            data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
            box_count, rbox1.data<data_t>(), rbox2.data<data_t>(),
            output.mutable_data<data_t>());
      }));

  return {output};
}
@ -0,0 +1,97 @@ |
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
|
||||
|
||||
#include "rbox_iou_op.h" |
||||
#include "paddle/extension.h" |
||||
|
||||
|
||||
template <typename T> |
||||
void rbox_iou_cpu_kernel( |
||||
const int rbox1_num, |
||||
const int rbox2_num, |
||||
const T* rbox1_data_ptr, |
||||
const T* rbox2_data_ptr, |
||||
T* output_data_ptr) { |
||||
|
||||
int i, j; |
||||
for (i = 0; i < rbox1_num; i++) { |
||||
for (j = 0; j < rbox2_num; j++) { |
||||
int offset = i * rbox2_num + j; |
||||
output_data_ptr[offset] = rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5); |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
#define CHECK_INPUT_CPU(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") |
||||
|
||||
// CPU entry point: checks that both inputs are CPU tensors and returns a
// (rbox1 rows x rbox2 rows) tensor of pairwise IoU values.
std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor& rbox1,
                                              const paddle::Tensor& rbox2) {
  CHECK_INPUT_CPU(rbox1);
  CHECK_INPUT_CPU(rbox2);

  auto rbox1_num = rbox1.shape()[0];
  auto rbox2_num = rbox2.shape()[0];
  auto iou_out =
      paddle::Tensor(paddle::PlaceType::kCPU, {rbox1_num, rbox2_num});

  // Instantiate the kernel for the floating point type of rbox1.
  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "rbox_iou_cpu_kernel", ([&] {
        const data_t* boxes1 = rbox1.data<data_t>();
        const data_t* boxes2 = rbox2.data<data_t>();
        rbox_iou_cpu_kernel<data_t>(rbox1_num, rbox2_num, boxes1, boxes2,
                                    iou_out.mutable_data<data_t>());
      }));

  return {iou_out};
}
||||
|
||||
|
||||
#ifdef PADDLE_WITH_CUDA |
||||
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2); |
||||
#endif |
||||
|
||||
|
||||
// Fails when the two tensors do not live on the same place (device).
// The message previously contained the typos "smae pacle".
#define CHECK_INPUT_SAME(x1, x2) PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
||||
|
||||
// Dispatches rbox_iou to the CPU or CUDA implementation according to the
// place of the inputs. Throws when the inputs live on different places or
// on a place that this build does not support.
std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) {
  CHECK_INPUT_SAME(rbox1, rbox2);
  if (rbox1.place() == paddle::PlaceType::kCPU) {
    return RboxIouCPUForward(rbox1, rbox2);
  }
#ifdef PADDLE_WITH_CUDA
  if (rbox1.place() == paddle::PlaceType::kGPU) {
    return RboxIouCUDAForward(rbox1, rbox2);
  }
#endif
  // The original fell off the end of this non-void function here, which is
  // undefined behavior; report the unsupported place instead.
  PD_THROW("rbox_iou: unsupported place, inputs must be CPU or GPU tensors "
           "supported by this build.");
}
||||
|
||||
// Output shape: (rows of the first input, rows of the second input).
std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> rbox1_shape, std::vector<int64_t> rbox2_shape) {
  std::vector<int64_t> out_shape;
  out_shape.push_back(rbox1_shape[0]);
  out_shape.push_back(rbox2_shape[0]);

  std::vector<std::vector<int64_t>> shapes;
  shapes.push_back(out_shape);
  return shapes;
}
||||
|
||||
// Output dtype: same as the dtype of the first input; t2 is not consulted.
std::vector<paddle::DataType> InferDtype(paddle::DataType t1, paddle::DataType t2) {
  std::vector<paddle::DataType> dtypes;
  dtypes.push_back(t1);
  return dtypes;
}
||||
|
||||
PD_BUILD_OP(rbox_iou) |
||||
.Inputs({"RBOX1", "RBOX2"}) |
||||
.Outputs({"Output"}) |
||||
.SetKernelFn(PD_KERNEL(RboxIouForward)) |
||||
.SetInferShapeFn(PD_INFER_SHAPE(InferShape)) |
||||
.SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)); |
@ -0,0 +1,114 @@ |
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
// |
||||
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
// you may not use this file except in compliance with the License. |
||||
// You may obtain a copy of the License at |
||||
// |
||||
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
// |
||||
// Unless required by applicable law or agreed to in writing, software |
||||
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
// See the License for the specific language governing permissions and |
||||
// limitations under the License. |
||||
// |
||||
// The code is based on |
||||
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated |
||||
|
||||
#include "paddle/extension.h" |
||||
#include "rbox_iou_op.h" |
||||
|
||||
// Thread-block extents used by the rbox IoU kernel: 32 threads along x and
// 16 along y, i.e. 32 * 16 = 512 threads per block.
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;

/**
 * Integer ceiling division, computed as (a + b - 1) / b.
 *
 * Used to turn a box count into the number of blocks needed to cover it.
 */
static inline int CeilDiv(const int a, const int b) {
  const int rounded_up = a + b - 1;
  return rounded_up / b;
}
||||
|
||||
template <typename T> |
||||
__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num, |
||||
const T *rbox1_data_ptr, |
||||
const T *rbox2_data_ptr, |
||||
T *output_data_ptr) { |
||||
|
||||
// get row_start and col_start |
||||
const int rbox1_block_idx = blockIdx.x * blockDim.x; |
||||
const int rbox2_block_idx = blockIdx.y * blockDim.y; |
||||
|
||||
const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x); |
||||
const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y); |
||||
|
||||
__shared__ T block_boxes1[BLOCK_DIM_X * 5]; |
||||
__shared__ T block_boxes2[BLOCK_DIM_Y * 5]; |
||||
|
||||
// It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y |
||||
if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) { |
||||
block_boxes1[threadIdx.x * 5 + 0] = |
||||
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0]; |
||||
block_boxes1[threadIdx.x * 5 + 1] = |
||||
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1]; |
||||
block_boxes1[threadIdx.x * 5 + 2] = |
||||
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2]; |
||||
block_boxes1[threadIdx.x * 5 + 3] = |
||||
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3]; |
||||
block_boxes1[threadIdx.x * 5 + 4] = |
||||
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4]; |
||||
} |
||||
|
||||
// threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as |
||||
// above: threadIdx.y == 0 |
||||
if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) { |
||||
block_boxes2[threadIdx.x * 5 + 0] = |
||||
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0]; |
||||
block_boxes2[threadIdx.x * 5 + 1] = |
||||
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1]; |
||||
block_boxes2[threadIdx.x * 5 + 2] = |
||||
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2]; |
||||
block_boxes2[threadIdx.x * 5 + 3] = |
||||
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3]; |
||||
block_boxes2[threadIdx.x * 5 + 4] = |
||||
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4]; |
||||
} |
||||
|
||||
// sync |
||||
__syncthreads(); |
||||
|
||||
if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) { |
||||
int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx + |
||||
threadIdx.y; |
||||
output_data_ptr[offset] = rbox_iou_single<T>( |
||||
block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); |
||||
} |
||||
} |
||||
|
||||
#define CHECK_INPUT_GPU(x) \ |
||||
PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.") |
||||
|
||||
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1, |
||||
const paddle::Tensor &rbox2) { |
||||
CHECK_INPUT_GPU(rbox1); |
||||
CHECK_INPUT_GPU(rbox2); |
||||
|
||||
auto rbox1_num = rbox1.shape()[0]; |
||||
auto rbox2_num = rbox2.shape()[0]; |
||||
|
||||
auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox1_num, rbox2_num}); |
||||
|
||||
const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X); |
||||
const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y); |
||||
|
||||
dim3 blocks(blocks_x, blocks_y); |
||||
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); |
||||
|
||||
PD_DISPATCH_FLOATING_TYPES( |
||||
rbox1.type(), "rbox_iou_cuda_kernel", ([&] { |
||||
rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>( |
||||
rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(), |
||||
output.mutable_data<data_t>()); |
||||
})); |
||||
|
||||
return {output}; |
||||
} |
@ -0,0 +1,348 @@ |
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// The code is based on
|
||||
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
|
||||
|
||||
#pragma once |
||||
|
||||
#include <cassert> |
||||
#include <cmath> |
||||
#include <vector> |
||||
|
||||
#ifdef __CUDACC__ |
||||
// Designates functions callable from the host (CPU) and the device (GPU)
|
||||
#define HOST_DEVICE __host__ __device__ |
||||
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ |
||||
#else |
||||
#include <algorithm> |
||||
#define HOST_DEVICE |
||||
#define HOST_DEVICE_INLINE HOST_DEVICE inline |
||||
#endif |
||||
|
||||
namespace { |
||||
|
||||
// Rotated rectangle described by its center, its side lengths and an angle.
// get_rotated_vertices() feeds `a` straight into cos()/sin().
template <typename T>
struct RotatedBox {
  T x_ctr;  // center x
  T y_ctr;  // center y
  T w;      // width
  T h;      // height
  T a;      // rotation angle
};
||||
|
||||
template <typename T> struct Point { |
||||
T x, y; |
||||
HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {} |
||||
HOST_DEVICE_INLINE Point operator+(const Point &p) const { |
||||
return Point(x + p.x, y + p.y); |
||||
} |
||||
HOST_DEVICE_INLINE Point &operator+=(const Point &p) { |
||||
x += p.x; |
||||
y += p.y; |
||||
return *this; |
||||
} |
||||
HOST_DEVICE_INLINE Point operator-(const Point &p) const { |
||||
return Point(x - p.x, y - p.y); |
||||
} |
||||
HOST_DEVICE_INLINE Point operator*(const T coeff) const { |
||||
return Point(x * coeff, y * coeff); |
||||
} |
||||
}; |
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE T dot_2d(const Point<T> &A, const Point<T> &B) { |
||||
return A.x * B.x + A.y * B.y; |
||||
} |
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE T cross_2d(const Point<T> &A, const Point<T> &B) { |
||||
return A.x * B.y - B.x * A.y; |
||||
} |
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T> &box, |
||||
Point<T> (&pts)[4]) { |
||||
// M_PI / 180. == 0.01745329251
|
||||
// double theta = box.a * 0.01745329251;
|
||||
// MODIFIED
|
||||
double theta = box.a; |
||||
T cosTheta2 = (T)cos(theta) * 0.5f; |
||||
T sinTheta2 = (T)sin(theta) * 0.5f; |
||||
|
||||
// y: top --> down; x: left --> right
|
||||
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; |
||||
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; |
||||
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; |
||||
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; |
||||
pts[2].x = 2 * box.x_ctr - pts[0].x; |
||||
pts[2].y = 2 * box.y_ctr - pts[0].y; |
||||
pts[3].x = 2 * box.x_ctr - pts[1].x; |
||||
pts[3].y = 2 * box.y_ctr - pts[1].y; |
||||
} |
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4], |
||||
const Point<T> (&pts2)[4], |
||||
Point<T> (&intersections)[24]) { |
||||
// Line vector
|
||||
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
|
||||
Point<T> vec1[4], vec2[4]; |
||||
for (int i = 0; i < 4; i++) { |
||||
vec1[i] = pts1[(i + 1) % 4] - pts1[i]; |
||||
vec2[i] = pts2[(i + 1) % 4] - pts2[i]; |
||||
} |
||||
|
||||
// Line test - test all line combos for intersection
|
||||
int num = 0; // number of intersections
|
||||
for (int i = 0; i < 4; i++) { |
||||
for (int j = 0; j < 4; j++) { |
||||
// Solve for 2x2 Ax=b
|
||||
T det = cross_2d<T>(vec2[j], vec1[i]); |
||||
|
||||
// This takes care of parallel lines
|
||||
if (fabs(det) <= 1e-14) { |
||||
continue; |
||||
} |
||||
|
||||
auto vec12 = pts2[j] - pts1[i]; |
||||
|
||||
T t1 = cross_2d<T>(vec2[j], vec12) / det; |
||||
T t2 = cross_2d<T>(vec1[i], vec12) / det; |
||||
|
||||
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { |
||||
intersections[num++] = pts1[i] + vec1[i] * t1; |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Check for vertices of rect1 inside rect2
|
||||
{ |
||||
const auto &AB = vec2[0]; |
||||
const auto &DA = vec2[3]; |
||||
auto ABdotAB = dot_2d<T>(AB, AB); |
||||
auto ADdotAD = dot_2d<T>(DA, DA); |
||||
for (int i = 0; i < 4; i++) { |
||||
// assume ABCD is the rectangle, and P is the point to be judged
|
||||
// P is inside ABCD iff. P's projection on AB lies within AB
|
||||
// and P's projection on AD lies within AD
|
||||
|
||||
auto AP = pts1[i] - pts2[0]; |
||||
|
||||
auto APdotAB = dot_2d<T>(AP, AB); |
||||
auto APdotAD = -dot_2d<T>(AP, DA); |
||||
|
||||
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && |
||||
(APdotAD <= ADdotAD)) { |
||||
intersections[num++] = pts1[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Reverse the check - check for vertices of rect2 inside rect1
|
||||
{ |
||||
const auto &AB = vec1[0]; |
||||
const auto &DA = vec1[3]; |
||||
auto ABdotAB = dot_2d<T>(AB, AB); |
||||
auto ADdotAD = dot_2d<T>(DA, DA); |
||||
for (int i = 0; i < 4; i++) { |
||||
auto AP = pts2[i] - pts1[0]; |
||||
|
||||
auto APdotAB = dot_2d<T>(AP, AB); |
||||
auto APdotAD = -dot_2d<T>(AP, DA); |
||||
|
||||
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && |
||||
(APdotAD <= ADdotAD)) { |
||||
intersections[num++] = pts2[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
return num; |
||||
} |
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24], |
||||
const int &num_in, Point<T> (&q)[24], |
||||
bool shift_to_zero = false) { |
||||
assert(num_in >= 2); |
||||
|
||||
// Step 1:
|
||||
// Find point with minimum y
|
||||
// if more than 1 points have the same minimum y,
|
||||
// pick the one with the minimum x.
|
||||
int t = 0; |
||||
for (int i = 1; i < num_in; i++) { |
||||
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { |
||||
t = i; |
||||
} |
||||
} |
||||
auto &start = p[t]; // starting point
|
||||
|
||||
// Step 2:
|
||||
// Subtract starting point from every points (for sorting in the next step)
|
||||
for (int i = 0; i < num_in; i++) { |
||||
q[i] = p[i] - start; |
||||
} |
||||
|
||||
// Swap the starting point to position 0
|
||||
auto tmp = q[0]; |
||||
q[0] = q[t]; |
||||
q[t] = tmp; |
||||
|
||||
// Step 3:
|
||||
// Sort point 1 ~ num_in according to their relative cross-product values
|
||||
// (essentially sorting according to angles)
|
||||
// If the angles are the same, sort according to their distance to origin
|
||||
T dist[24]; |
||||
for (int i = 0; i < num_in; i++) { |
||||
dist[i] = dot_2d<T>(q[i], q[i]); |
||||
} |
||||
|
||||
#ifdef __CUDACC__ |
||||
// CUDA version
|
||||
// In the future, we can potentially use thrust
|
||||
// for sorting here to improve speed (though not guaranteed)
|
||||
for (int i = 1; i < num_in - 1; i++) { |
||||
for (int j = i + 1; j < num_in; j++) { |
||||
T crossProduct = cross_2d<T>(q[i], q[j]); |
||||
if ((crossProduct < -1e-6) || |
||||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { |
||||
auto q_tmp = q[i]; |
||||
q[i] = q[j]; |
||||
q[j] = q_tmp; |
||||
auto dist_tmp = dist[i]; |
||||
dist[i] = dist[j]; |
||||
dist[j] = dist_tmp; |
||||
} |
||||
} |
||||
} |
||||
#else |
||||
// CPU version
|
||||
std::sort(q + 1, q + num_in, |
||||
[](const Point<T> &A, const Point<T> &B) -> bool { |
||||
T temp = cross_2d<T>(A, B); |
||||
if (fabs(temp) < 1e-6) { |
||||
return dot_2d<T>(A, A) < dot_2d<T>(B, B); |
||||
} else { |
||||
return temp > 0; |
||||
} |
||||
}); |
||||
#endif |
||||
|
||||
// Step 4:
|
||||
// Make sure there are at least 2 points (that don't overlap with each other)
|
||||
// in the stack
|
||||
int k; // index of the non-overlapped second point
|
||||
for (k = 1; k < num_in; k++) { |
||||
if (dist[k] > 1e-8) { |
||||
break; |
||||
} |
||||
} |
||||
if (k == num_in) { |
||||
// We reach the end, which means the convex hull is just one point
|
||||
q[0] = p[t]; |
||||
return 1; |
||||
} |
||||
q[1] = q[k]; |
||||
int m = 2; // 2 points in the stack
|
||||
// Step 5:
|
||||
// Finally we can start the scanning process.
|
||||
// When a non-convex relationship between the 3 points is found
|
||||
// (either concave shape or duplicated points),
|
||||
// we pop the previous point from the stack
|
||||
// until the 3-point relationship is convex again, or
|
||||
// until the stack only contains two points
|
||||
for (int i = k + 1; i < num_in; i++) { |
||||
while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { |
||||
m--; |
||||
} |
||||
q[m++] = q[i]; |
||||
} |
||||
|
||||
// Step 6 (Optional):
|
||||
// In general sense we need the original coordinates, so we
|
||||
// need to shift the points back (reverting Step 2)
|
||||
// But if we're only interested in getting the area/perimeter of the shape
|
||||
// We can simply return.
|
||||
if (!shift_to_zero) { |
||||
for (int i = 0; i < m; i++) { |
||||
q[i] += start; |
||||
} |
||||
} |
||||
|
||||
return m; |
||||
} |
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int &m) { |
||||
if (m <= 2) { |
||||
return 0; |
||||
} |
||||
|
||||
T area = 0; |
||||
for (int i = 1; i < m - 1; i++) { |
||||
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0])); |
||||
} |
||||
|
||||
return area / 2.0; |
||||
} |
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox<T> &box1, |
||||
const RotatedBox<T> &box2) { |
||||
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
|
||||
// from rotated_rect_intersection_pts
|
||||
Point<T> intersectPts[24], orderedPts[24]; |
||||
|
||||
Point<T> pts1[4]; |
||||
Point<T> pts2[4]; |
||||
get_rotated_vertices<T>(box1, pts1); |
||||
get_rotated_vertices<T>(box2, pts2); |
||||
|
||||
int num = get_intersection_points<T>(pts1, pts2, intersectPts); |
||||
|
||||
if (num <= 2) { |
||||
return 0.0; |
||||
} |
||||
|
||||
// Convex Hull to order the intersection points in clockwise order and find
|
||||
// the contour area.
|
||||
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true); |
||||
return polygon_area<T>(orderedPts, num_convex); |
||||
} |
||||
|
||||
} // namespace
|
||||
|
||||
template <typename T> |
||||
HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw, |
||||
T const *const box2_raw) { |
||||
// shift center to the middle point to achieve higher precision in result
|
||||
RotatedBox<T> box1, box2; |
||||
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; |
||||
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; |
||||
box1.x_ctr = box1_raw[0] - center_shift_x; |
||||
box1.y_ctr = box1_raw[1] - center_shift_y; |
||||
box1.w = box1_raw[2]; |
||||
box1.h = box1_raw[3]; |
||||
box1.a = box1_raw[4]; |
||||
box2.x_ctr = box2_raw[0] - center_shift_x; |
||||
box2.y_ctr = box2_raw[1] - center_shift_y; |
||||
box2.w = box2_raw[2]; |
||||
box2.h = box2_raw[3]; |
||||
box2.a = box2_raw[4]; |
||||
|
||||
const T area1 = box1.w * box1.h; |
||||
const T area2 = box2.w * box2.h; |
||||
if (area1 < 1e-14 || area2 < 1e-14) { |
||||
return 0.f; |
||||
} |
||||
|
||||
const T intersection = rboxes_intersection<T>(box1, box2); |
||||
const T iou = intersection / (area1 + area2 - intersection); |
||||
return iou; |
||||
} |
@ -0,0 +1,33 @@ |
||||
import os |
||||
import glob |
||||
import paddle |
||||
from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup |
||||
|
||||
|
||||
def get_extensions():
    """Build the extension object for every op under ``csrc``.

    Each entry of ``<this dir>/csrc`` is scanned for ``*.cc`` sources and,
    when Paddle was compiled with CUDA, for ``*.cu`` sources as well. All
    sources found are compiled into a single extension.

    Returns:
        A ``CUDAExtension`` (built with ``-DPADDLE_WITH_CUDA``) when Paddle
        was compiled with CUDA, otherwise a ``CppExtension``.
    """
    root_dir = os.path.dirname(os.path.abspath(__file__))
    ext_root_dir = os.path.join(root_dir, 'csrc')
    # Query once; the answer cannot change while the sources are collected.
    with_cuda = paddle.device.is_compiled_with_cuda()

    sources = []
    # Sorted so that the source list does not depend on directory order.
    for ext_name in sorted(os.listdir(ext_root_dir)):
        ext_dir = os.path.join(ext_root_dir, ext_name)
        sources += sorted(glob.glob(os.path.join(ext_dir, '*.cc')))
        if with_cuda:
            sources += sorted(glob.glob(os.path.join(ext_dir, '*.cu')))

    if with_cuda:
        return CUDAExtension(
            sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']})
    return CppExtension(sources)


if __name__ == "__main__":
    setup(name='ext_op', ext_modules=get_extensions())
@ -0,0 +1,149 @@ |
||||
import numpy as np |
||||
import sys |
||||
import time |
||||
from shapely.geometry import Polygon |
||||
import paddle |
||||
import unittest |
||||
|
||||
from ext_op import matched_rbox_iou |
||||
|
||||
|
||||
def rbox2poly_single(rrect, get_best_begin_point=False):
    """Convert one rotated box into the flat coordinates of its corners.

    Args:
        rrect: sequence whose first five values are
            (x_ctr, y_ctr, w, h, angle); the angle is passed directly to
            ``np.cos`` / ``np.sin``.
        get_best_begin_point: unused, kept for interface compatibility.

    Returns:
        float64 array ``[x0, y0, x1, y1, x2, y2, x3, y3]``.
    """
    x_ctr, y_ctr, width, height, angle = rrect[:5]
    left, top, right, bottom = -width / 2, -height / 2, width / 2, height / 2

    # Axis-aligned corners centered at the origin, as a 2 x 4 matrix.
    corners = np.array([[left, right, right, left],
                        [top, top, bottom, bottom]])
    cos_a, sin_a = np.cos(angle), np.sin(angle)
    rotation = np.array([[cos_a, -sin_a], [sin_a, cos_a]])

    rotated = rotation.dot(corners)
    xs = rotated[0, :4] + x_ctr
    ys = rotated[1, :4] + y_ctr

    # Interleave as x0, y0, x1, y1, ...
    return np.stack((xs, ys), axis=1).reshape(-1).astype(np.float64)
||||
|
||||
|
||||
def intersection(g, p):
    """Return the intersection-over-union (IoU) of two quadrilaterals.

    Despite its name this function returns the IoU, not the intersection
    area.

    Args:
        g: array whose first 8 values are ``x0, y0, ..., x3, y3``.
        p: array with the same layout as ``g``.

    Returns:
        float: the IoU, or ``0.0`` when
            * the axis-aligned bounding boxes do not overlap,
            * the joint bounding box is smaller than 2 along either axis,
            * either polygon is not valid, or
            * the union area is 0.
    """
    g = g[:8].reshape((4, 2))
    p = p[:8].reshape((4, 2))

    g_min, g_max = g.min(axis=0), g.max(axis=0)
    p_min, p_max = p.min(axis=0), p.max(axis=0)

    # Cheap reject 1: the axis-aligned bounding boxes do not overlap.
    overlap_min = np.maximum(g_min, p_min)
    overlap_max = np.minimum(g_max, p_max)
    if overlap_min[0] >= overlap_max[0] or overlap_min[1] >= overlap_max[1]:
        return 0.0

    # Cheap reject 2: the joint bounding box is degenerate or tiny.
    joint_extent = np.maximum(g_max, p_max) - np.minimum(g_min, p_min)
    if joint_extent[0] < 2 or joint_extent[1] < 2:
        return 0.0

    poly_g = Polygon(g)
    poly_p = Polygon(p)
    if not (poly_g.is_valid and poly_p.is_valid):
        return 0.0

    inter = poly_g.intersection(poly_p).area
    union = poly_g.area + poly_p.area - inter
    if union == 0:
        return 0.0
    return inter / union
||||
|
||||
|
||||
def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False):
    """Compute the IoU of each anchor with the ground-truth box at the same index.

    Args:
        anchors: array of shape [M, 5]; every row is converted with
            ``rbox2poly_single``, which reads it as
            (x_ctr, y_ctr, w, h, angle).
        gt_bboxes: array of shape [M, 5], same layout as ``anchors``.
        use_cv2: unused, kept for interface compatibility.

    Returns:
        matched_iou: float64 array of shape [M]. An entry whose computation
            raised is reported on stdout and left at 0.
    """
    assert anchors.shape[1] == 5
    assert gt_bboxes.shape[1] == 5

    gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]
    anchors_ploy = [rbox2poly_single(e) for e in anchors]

    iou = np.zeros((len(anchors_ploy), ), dtype=np.float64)

    for i, anchor_poly in enumerate(anchors_ploy):
        gt_poly = gt_bboxes_ploy[i]
        try:
            iou[i] = intersection(gt_poly, anchor_poly)
        except Exception as e:
            # Best effort: report the offending pair and keep going.
            print('cur gt_bboxes_ploy[i]', gt_poly, 'anchors_ploy[i]',
                  anchor_poly, e)
    return iou
||||
|
||||
|
||||
def gen_sample(n):
    """Draw ``n`` random boxes as an ``[n, 5]`` float array.

    Columns 0-3 are uniform samples mapped to ``[0.001, 0.451)``; column 4
    is a uniform sample shifted to ``[-0.5, 0.5)``.
    """
    samples = np.random.rand(n, 5)

    geometry = samples[:, 0:4]  # view: edits below modify `samples`
    geometry *= 0.45
    geometry += 0.001

    samples[:, 4] -= 0.5
    return samples
||||
|
||||
|
||||
class MatchedRBoxIoUTest(unittest.TestCase):
    """Compare the ``matched_rbox_iou`` op with the shapely-based reference."""

    def setUp(self):
        self.initTestCase()
        self.rbox1 = gen_sample(self.n)
        self.rbox2 = gen_sample(self.n)

    def initTestCase(self):
        # Number of box pairs to evaluate.
        self.n = 1000

    def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):
        # NOTE(review): atol=0.5 looks very loose for the values compared
        # here -- confirm that this tolerance is intended.
        self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)

    def get_places(self):
        """Return the CPU place, plus GPU 0 when Paddle was built with CUDA."""
        places = [paddle.CPUPlace()]
        if paddle.device.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))

        return places

    def check_output(self, place):
        """Run the op on ``place`` and compare it with the reference."""
        paddle.disable_static()
        pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)
        pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)
        actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy()

        # Scale copies for the reference: scaling self.rbox1 / self.rbox2 in
        # place would feed already-scaled boxes to the next place.
        poly_rbox1 = self.rbox1.copy()
        poly_rbox2 = self.rbox2.copy()
        poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024
        poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024
        expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)
        self.assertAllClose(
            actual_t,
            expect_t,
            msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format(
                str(place), str(expect_t), str(actual_t)))

    def test_output(self):
        places = self.get_places()
        for place in places:
            self.check_output(place)


if __name__ == "__main__":
    unittest.main()
@ -0,0 +1,151 @@ |
||||
import numpy as np |
||||
import sys |
||||
import time |
||||
from shapely.geometry import Polygon |
||||
import paddle |
||||
import unittest |
||||
|
||||
from ext_op import rbox_iou |
||||
|
||||
|
||||
def rbox2poly_single(rrect, get_best_begin_point=False):
    """Convert one rotated box into the flat coordinates of its corners.

    Args:
        rrect: sequence whose first five values are
            (x_ctr, y_ctr, w, h, angle); the angle is passed directly to
            ``np.cos`` / ``np.sin``.
        get_best_begin_point: unused, kept for interface compatibility.

    Returns:
        float64 array ``[x0, y0, x1, y1, x2, y2, x3, y3]``.
    """
    x_ctr, y_ctr, width, height, angle = rrect[:5]
    left, top, right, bottom = -width / 2, -height / 2, width / 2, height / 2

    # Axis-aligned corners centered at the origin, as a 2 x 4 matrix.
    corners = np.array([[left, right, right, left],
                        [top, top, bottom, bottom]])
    cos_a, sin_a = np.cos(angle), np.sin(angle)
    rotation = np.array([[cos_a, -sin_a], [sin_a, cos_a]])

    rotated = rotation.dot(corners)
    xs = rotated[0, :4] + x_ctr
    ys = rotated[1, :4] + y_ctr

    # Interleave as x0, y0, x1, y1, ...
    return np.stack((xs, ys), axis=1).reshape(-1).astype(np.float64)
||||
|
||||
|
||||
def intersection(g, p):
    """Return the intersection-over-union (IoU) of two quadrilaterals.

    Despite its name this function returns the IoU, not the intersection
    area.

    Args:
        g: array whose first 8 values are ``x0, y0, ..., x3, y3``.
        p: array with the same layout as ``g``.

    Returns:
        float: the IoU, or ``0.0`` when
            * the axis-aligned bounding boxes do not overlap,
            * the joint bounding box is smaller than 2 along either axis,
            * either polygon is not valid, or
            * the union area is 0.
    """
    g = g[:8].reshape((4, 2))
    p = p[:8].reshape((4, 2))

    g_min, g_max = g.min(axis=0), g.max(axis=0)
    p_min, p_max = p.min(axis=0), p.max(axis=0)

    # Cheap reject 1: the axis-aligned bounding boxes do not overlap.
    overlap_min = np.maximum(g_min, p_min)
    overlap_max = np.minimum(g_max, p_max)
    if overlap_min[0] >= overlap_max[0] or overlap_min[1] >= overlap_max[1]:
        return 0.0

    # Cheap reject 2: the joint bounding box is degenerate or tiny.
    joint_extent = np.maximum(g_max, p_max) - np.minimum(g_min, p_min)
    if joint_extent[0] < 2 or joint_extent[1] < 2:
        return 0.0

    poly_g = Polygon(g)
    poly_p = Polygon(p)
    if not (poly_g.is_valid and poly_p.is_valid):
        return 0.0

    inter = poly_g.intersection(poly_p).area
    union = poly_g.area + poly_p.area - inter
    if union == 0:
        return 0.0
    return inter / union
||||
|
||||
|
||||
def rbox_overlaps(anchors, gt_bboxes, use_cv2=False):
    """Compute the IoU of every anchor with every ground-truth box.

    Args:
        anchors: array of shape [NA, 5]; every row is converted with
            ``rbox2poly_single``, which reads it as
            (x_ctr, y_ctr, w, h, angle).
        gt_bboxes: array of shape [M, 5], same layout as ``anchors``.
        use_cv2: unused, kept for interface compatibility.

    Returns:
        iou: float64 array of shape [NA, M]. An entry whose computation
            raised is reported on stdout and left at 0.
    """
    assert anchors.shape[1] == 5
    assert gt_bboxes.shape[1] == 5

    gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]
    anchors_ploy = [rbox2poly_single(e) for e in anchors]

    num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy)
    iou = np.zeros((num_anchors, num_gt), dtype=np.float64)

    for i, anchor_poly in enumerate(anchors_ploy):
        for j, gt_poly in enumerate(gt_bboxes_ploy):
            try:
                iou[i, j] = intersection(anchor_poly, gt_poly)
            except Exception as e:
                # Best effort: report the offending pair and keep going.
                print('cur anchors_ploy[i]', anchor_poly,
                      'gt_bboxes_ploy[j]', gt_poly, e)
    return iou
||||
|
||||
|
||||
def gen_sample(n):
    """Draw ``n`` random boxes as an ``[n, 5]`` float array.

    Columns 0-3 are uniform samples mapped to ``[0.001, 0.451)``; column 4
    is a uniform sample shifted to ``[-0.5, 0.5)``.
    """
    samples = np.random.rand(n, 5)

    geometry = samples[:, 0:4]  # view: edits below modify `samples`
    geometry *= 0.45
    geometry += 0.001

    samples[:, 4] -= 0.5
    return samples
||||
|
||||
|
||||
class RBoxIoUTest(unittest.TestCase):
    """Compare the ``rbox_iou`` op with the shapely-based reference."""

    def setUp(self):
        self.initTestCase()
        self.rbox1 = gen_sample(self.n)
        self.rbox2 = gen_sample(self.m)

    def initTestCase(self):
        # Number of boxes in the first and in the second input.
        self.n = 13000
        self.m = 7

    def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):
        # NOTE(review): atol=0.5 looks very loose for the values compared
        # here -- confirm that this tolerance is intended.
        self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)

    def get_places(self):
        """Return the CPU place, plus GPU 0 when Paddle was built with CUDA."""
        places = [paddle.CPUPlace()]
        if paddle.device.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))

        return places

    def check_output(self, place):
        """Run the op on ``place`` and compare it with the reference."""
        paddle.disable_static()
        pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)
        pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)
        actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy()

        # Scale copies for the reference: scaling self.rbox1 / self.rbox2 in
        # place would feed already-scaled boxes to the next place.
        poly_rbox1 = self.rbox1.copy()
        poly_rbox2 = self.rbox2.copy()
        poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024
        poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024
        expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)
        self.assertAllClose(
            actual_t,
            expect_t,
            msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format(
                str(place), str(expect_t), str(actual_t)))

    def test_output(self):
        places = self.get_places()
        for place in places:
            self.check_output(place)


if __name__ == "__main__":
    unittest.main()
@ -0,0 +1 @@ |
||||
MODEL_ZOO |
@ -0,0 +1,13 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
@ -0,0 +1,48 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
import os |
||||
import paddle |
||||
import paddlers.models.ppdet as ppdet |
||||
import unittest |
||||
|
||||
# NOTE: weights downloading costs time, we choose |
||||
# a small model for unittesting |
||||
MODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco' |
||||
|
||||
|
||||
class TestGetConfigFile(unittest.TestCase):
    """``model_zoo.get_config_file`` must return the path of an existing file."""

    def test_main(self):
        # Any exception raised by the lookup propagates, so the test report
        # shows the real cause instead of a bare "False is not true".
        cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME)
        self.assertTrue(os.path.isfile(cfg_file))
||||
|
||||
|
||||
class TestGetModel(unittest.TestCase):
    """``model_zoo.get_model`` must return a ``paddle.nn.Layer``."""

    def test_main(self):
        # Any exception raised while building the model propagates, so the
        # test report shows the real cause instead of a bare
        # "False is not true".
        model = ppdet.model_zoo.get_model(MODEL_NAME)
        self.assertIsInstance(model, paddle.nn.Layer)


if __name__ == '__main__':
    unittest.main()
@ -0,0 +1,68 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
import unittest |
||||
import paddlers.models.ppdet as ppdet |
||||
|
||||
|
||||
class TestListModel(unittest.TestCase):
    """``model_zoo.list_model`` must succeed for the configured filter.

    Subclasses only override ``setUp`` to provide a different filter.
    """

    def setUp(self):
        self._filter = []

    def test_main(self):
        # The test passes when the call returns; any exception propagates so
        # that the report shows its real cause.
        ppdet.model_zoo.list_model(self._filter)


class TestListModelYOLO(TestListModel):
    """Same check with the filter ``['yolo']``."""

    def setUp(self):
        self._filter = ['yolo']


class TestListModelRCNN(TestListModel):
    """Same check with the filter ``['rcnn']``."""

    def setUp(self):
        self._filter = ['rcnn']


class TestListModelSSD(TestListModel):
    """Same check with the filter ``['ssd']``."""

    def setUp(self):
        self._filter = ['ssd']


class TestListModelMultiFilter(TestListModel):
    """Same check with the two-entry filter ``['yolo', 'darknet']``."""

    def setUp(self):
        self._filter = ['yolo', 'darknet']
||||
|
||||
|
||||
class TestListModelError(unittest.TestCase):
    """``model_zoo.list_model`` must raise ``ValueError`` for the filter ``['xxx']``."""

    def setUp(self):
        self._filter = ['xxx']

    def test_main(self):
        # Fails if no exception is raised; any other exception type
        # propagates as an error.
        with self.assertRaises(ValueError):
            ppdet.model_zoo.list_model(self._filter)


if __name__ == '__main__':
    unittest.main()
@ -0,0 +1,79 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
from paddlers.models.ppdet.core.workspace import register, create |
||||
from .meta_arch import BaseArch |
||||
|
||||
__all__ = ['ByteTrack'] |
||||
|
||||
|
||||
@register
class ByteTrack(BaseArch):
    """
    ByteTrack network, see https://arxiv.org/abs/2110.06864

    Args:
        detector (object): detector model instance
        reid (object): reid model instance, default None
        tracker (object): tracker instance
    """
    __category__ = 'architecture'

    def __init__(self, detector='YOLOX', reid=None, tracker='JDETracker'):
        super(ByteTrack, self).__init__()
        self.detector = detector
        self.reid = reid
        self.tracker = tracker

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Create the sub-modules named in ``cfg``.

        Reads ``cfg['detector']``, ``cfg['reid']`` and ``cfg['tracker']``.
        No reid model is created when ``cfg['reid']`` is ``None`` or the
        string ``'None'``.

        Returns:
            dict with the keys ``detector``, ``reid`` and ``tracker``.
        """
        detector = create(cfg['detector'])

        reid_cfg = cfg['reid']
        # Accept a real None as well as the string 'None'.
        if reid_cfg is None or reid_cfg == 'None':
            reid = None
        else:
            reid = create(reid_cfg)

        tracker = create(cfg['tracker'])

        return {
            "detector": detector,
            "reid": reid,
            "tracker": tracker,
        }

    def _forward(self):
        """Run the detector and, outside training, the optional reid model.

        In training mode the detector outputs are returned unchanged.
        Otherwise ``det_outs['embeddings']`` is set to the reid output for
        ``self.inputs['crops']``, or to ``None`` when no reid model is
        configured.
        """
        det_outs = self.detector(self.inputs)

        if self.training:
            return det_outs

        if self.reid is not None:
            assert 'crops' in self.inputs
            crops = self.inputs['crops']
            pred_embs = self.reid(crops)
        else:
            pred_embs = None
        det_outs['embeddings'] = pred_embs
        return det_outs

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()
@ -0,0 +1,68 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
from paddlers.models.ppdet.core.workspace import register, create |
||||
from .meta_arch import BaseArch |
||||
import paddle |
||||
|
||||
__all__ = ['RetinaNet'] |
||||
|
||||
|
||||
@register
class RetinaNet(BaseArch):
    """Detection architecture that chains a backbone, a neck and a head.

    Args:
        backbone: module called on the input dict to produce body features
        neck: module called on the body features
        head: module that returns its training output when given the
            inputs, and raw outputs for ``post_process`` otherwise
    """
    __category__ = 'architecture'

    def __init__(self, backbone, neck, head):
        super(RetinaNet, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Create backbone, neck and head, wiring each ``out_shape`` onward."""
        backbone = create(cfg['backbone'])
        # Each stage is told the output shape of the stage that feeds it.
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, 'head': head}

    def _forward(self):
        """Return the head's training output, or post-processed boxes."""
        neck_feats = self.neck(self.backbone(self.inputs))

        if self.training:
            return self.head(neck_feats, self.inputs)

        head_outs = self.head(neck_feats)
        im_shape = self.inputs['im_shape']
        scale_factor = self.inputs['scale_factor']
        bbox, bbox_num = self.head.post_process(head_outs, im_shape,
                                                scale_factor)
        return {'bbox': bbox, 'bbox_num': bbox_num}

    def get_loss(self):
        """Return the result of :meth:`_forward`."""
        return self._forward()

    def get_pred(self):
        """Return the result of :meth:`_forward`."""
        return self._forward()
@ -0,0 +1,138 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
from paddlers.models.ppdet.core.workspace import register, create |
||||
from .meta_arch import BaseArch |
||||
|
||||
import random |
||||
import paddle |
||||
import paddle.nn.functional as F |
||||
import paddle.distributed as dist |
||||
|
||||
__all__ = ['YOLOX'] |
||||
|
||||
|
||||
@register
class YOLOX(BaseArch):
    """
    YOLOX network, see https://arxiv.org/abs/2107.08430

    Args:
        backbone (nn.Layer): backbone instance
        neck (nn.Layer): neck instance
        head (nn.Layer): head instance
        for_mot (bool): whether used for MOT or not; forwarded to the neck
        input_size (list[int]): initial scale, the working size
            ``self._input_size`` is reset by ``self._preprocess()``
        size_stride (int): stride of the size range
        size_range (list[int]): multi-scale range for training
        random_interval (int): interval of iter to change self._input_size
    """
    __category__ = 'architecture'

    def __init__(self,
                 backbone='CSPDarkNet',
                 neck='YOLOCSPPAN',
                 head='YOLOXHead',
                 for_mot=False,
                 input_size=[640, 640],
                 size_stride=32,
                 size_range=[15, 25],
                 random_interval=10):
        super(YOLOX, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head
        self.for_mot = for_mot

        # `input_size` is the reference size; `_input_size` is the size
        # currently used for training and is re-drawn by `_get_size`.
        self.input_size = input_size
        self._input_size = paddle.to_tensor(input_size)
        self.size_stride = size_stride
        self.size_range = size_range
        self.random_interval = random_interval
        self._step = 0

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Create backbone, neck and head, wiring each ``out_shape`` onward."""
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        """Return the losses in training mode, otherwise the predictions."""
        if self.training:
            self._preprocess()
        neck_feats = self.neck(self.backbone(self.inputs), self.for_mot)

        if not self.training:
            head_outs = self.head(neck_feats)
            im_shape = self.inputs['im_shape']
            scale_factor = self.inputs['scale_factor']
            bbox, bbox_num = self.head.post_process(head_outs, im_shape,
                                                    scale_factor)
            return {'bbox': bbox, 'bbox_num': bbox_num}

        yolox_losses = self.head(neck_feats, self.inputs)
        # Report the first entry of the current training size with the losses.
        yolox_losses.update({'size': self._input_size[0]})
        return yolox_losses

    def get_loss(self):
        """Return the result of :meth:`_forward`."""
        return self._forward()

    def get_pred(self):
        """Return the result of :meth:`_forward`."""
        return self._forward()

    def _preprocess(self):
        """Resize the image batch and the gt boxes to the current size.

        Multi-scale training: a size is chosen by ``_get_size`` and, when it
        differs from ``self.input_size``, ``self.inputs['image']`` is
        bilinearly interpolated to it and ``self.inputs['gt_bbox']`` is
        scaled by the same factors.
        """
        self._get_size()
        scale_y = self._input_size[0] / self.input_size[0]
        scale_x = self._input_size[1] / self.input_size[1]
        if not (scale_x != 1 or scale_y != 1):
            # Current size equals the reference size: nothing to rescale.
            return

        self.inputs['image'] = F.interpolate(
            self.inputs['image'],
            size=self._input_size,
            mode='bilinear',
            align_corners=False)
        gt_bboxes = self.inputs['gt_bbox']
        for idx in range(len(gt_bboxes)):
            if len(gt_bboxes[idx]) > 0:
                # Even columns are scaled by scale_x, odd columns by scale_y.
                gt_bboxes[idx][:, 0::2] = gt_bboxes[idx][:, 0::2] * scale_x
                gt_bboxes[idx][:, 1::2] = gt_bboxes[idx][:, 1::2] * scale_y
        self.inputs['gt_bbox'] = gt_bboxes

    def _get_size(self):
        """Draw a new ``self._input_size`` every ``random_interval`` calls."""
        image_ratio = self.input_size[1] * 1.0 / self.input_size[0]
        if self._step % self.random_interval == 0:
            size_factor = random.randint(*self.size_range)
            first_dim = self.size_stride * size_factor
            second_dim = self.size_stride * int(size_factor * image_ratio)
            self._input_size = paddle.to_tensor([first_dim, second_dim])
        self._step += 1
@ -0,0 +1,54 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
from paddlers.models.ppdet.core.workspace import register |
||||
from paddlers.models.ppdet.modeling.proposal_generator.target import label_box |
||||
|
||||
__all__ = ['MaxIoUAssigner'] |
||||
|
||||
|
||||
@register
class MaxIoUAssigner(object):
    """A standard max-IoU bbox assigner that uses ppdet's ``label_box``
    as backend.

    Args:
        positive_overlap (float): threshold for defining positive samples
        negative_overlap (float): threshold for defining negative samples
        allow_low_quality (bool): whether to lower IoU thr if a GT poorly
            overlaps with candidate bboxes
    """

    def __init__(self,
                 positive_overlap,
                 negative_overlap,
                 allow_low_quality=True):
        self.positive_overlap = positive_overlap
        self.negative_overlap = negative_overlap
        self.allow_low_quality = allow_low_quality

    def __call__(self, bboxes, gt_bboxes):
        """Forward both box sets to ``label_box`` and return its two results."""
        # The thresholds come from this instance; the rest is fixed.
        label_box_options = dict(
            positive_overlap=self.positive_overlap,
            negative_overlap=self.negative_overlap,
            allow_low_quality=self.allow_low_quality,
            ignore_thresh=-1,
            is_crowd=None,
            assign_on_cpu=False)
        matches, match_labels = label_box(bboxes, gt_bboxes,
                                          **label_box_options)
        return matches, match_labels
@ -0,0 +1,245 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
''' |
||||
Modified from https://github.com/facebookresearch/ConvNeXt |
||||
Copyright (c) Meta Platforms, Inc. and affiliates. |
||||
All rights reserved. |
||||
This source code is licensed under the license found in the |
||||
LICENSE file in the root directory of this source tree. |
||||
''' |
||||
|
||||
import paddle |
||||
import paddle.nn as nn |
||||
import paddle.nn.functional as F |
||||
from paddle import ParamAttr |
||||
from paddle.nn.initializer import Constant |
||||
|
||||
import numpy as np |
||||
|
||||
from paddlers.models.ppdet.core.workspace import register, serializable |
||||
from ..shape_spec import ShapeSpec |
||||
from .transformer_utils import DropPath, trunc_normal_, zeros_ |
||||
|
||||
__all__ = ['ConvNeXt'] |
||||
|
||||
|
||||
class Block(nn.Layer):
    r""" ConvNeXt Block.

    Applies: depthwise 7x7 conv -> permute to (N, H, W, C) -> LayerNorm
    (channels_last) -> Linear -> GELU -> Linear -> optional layer scale ->
    permute back to (N, C, H, W) -> drop path -> residual add.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale; no scale
            parameter is created when it is not positive. Default: 1e-6.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        hidden_dim = 4 * dim

        # depthwise conv
        self.dwconv = nn.Conv2D(
            dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # pointwise/1x1 convs, implemented with linear layers
        self.pwconv1 = nn.Linear(dim, hidden_dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(hidden_dim, dim)

        self.gamma = self.create_parameter(
            shape=(dim, ),
            attr=ParamAttr(initializer=Constant(layer_scale_init_value))
        ) if layer_scale_init_value > 0 else None

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        """Return ``x`` plus the (optionally dropped) block branch output."""
        shortcut = x
        out = self.dwconv(x)
        out = out.transpose([0, 2, 3, 1])  # (N, C, H, W) -> (N, H, W, C)
        out = self.pwconv2(self.act(self.pwconv1(self.norm(out))))
        if self.gamma is not None:
            out = self.gamma * out
        out = out.transpose([0, 3, 1, 2])  # (N, H, W, C) -> (N, C, H, W)
        return shortcut + self.drop_path(out)
||||
|
||||
|
||||
class LayerNorm(nn.Layer):
    r""" LayerNorm supporting two data formats.

    ``channels_last`` (default) corresponds to inputs with shape
    (batch_size, height, width, channels) and is delegated to
    ``F.layer_norm``; ``channels_first`` corresponds to inputs with shape
    (batch_size, channels, height, width) and is normalized over axis 1.

    Args:
        normalized_shape (int): size of the normalized (channel) dimension.
        eps (float): value added to the variance. Default: 1e-6
        data_format (str): "channels_last" or "channels_first".

    Raises:
        NotImplementedError: if ``data_format`` is neither of the two.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()

        # Learnable scale starts at 1 and shift at 0.
        self.weight = self.create_parameter(
            shape=(normalized_shape, ),
            attr=ParamAttr(initializer=Constant(1.)))
        self.bias = self.create_parameter(
            shape=(normalized_shape, ),
            attr=ParamAttr(initializer=Constant(0.)))

        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        """Normalize ``x`` according to ``self.data_format``."""
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight,
                                self.bias, self.eps)
        elif self.data_format == "channels_first":
            # Statistics are taken over axis 1 (the channel axis).
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            normed = (x - mean) / paddle.sqrt(var + self.eps)
            scale = self.weight[:, None, None]
            shift = self.bias[:, None, None]
            return scale * normed + shift
||||
|
||||
|
||||
@register
@serializable
class ConvNeXt(nn.Layer):
    r""" ConvNeXt
        A Paddle impl of : `A ConvNet for the 2020s`  -
          https://arxiv.org/pdf/2201.03545.pdf

    Args:
        arch (str): key into ``arch_settings`` selecting the per-stage
            block counts ('depths') and widths ('dims'). Default: 'tiny'
        in_chans (int): Number of input image channels. Default: 3
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        return_idx (list[int]): indices of the stages whose features are
            returned. Default: [1, 2, 3]
        norm_output (bool): whether to apply a channels_first LayerNorm to
            each returned feature. Default: True
        pretrained (str|None): URL (any string containing 'http') or local
            path of weights to load. Default: None
    """

    arch_settings = {
        'tiny': {
            'depths': [3, 3, 9, 3],
            'dims': [96, 192, 384, 768]
        },
        'small': {
            'depths': [3, 3, 27, 3],
            'dims': [96, 192, 384, 768]
        },
        'base': {
            'depths': [3, 3, 27, 3],
            'dims': [128, 256, 512, 1024]
        },
        'large': {
            'depths': [3, 3, 27, 3],
            'dims': [192, 384, 768, 1536]
        },
        'xlarge': {
            'depths': [3, 3, 27, 3],
            'dims': [256, 512, 1024, 2048]
        },
    }

    def __init__(
            self,
            arch='tiny',
            in_chans=3,
            drop_path_rate=0.,
            layer_scale_init_value=1e-6,
            return_idx=[1, 2, 3],
            norm_output=True,
            pretrained=None, ):
        super().__init__()
        setting = self.arch_settings[arch]
        depths = setting['depths']
        dims = setting['dims']

        # stem and 3 intermediate downsampling conv layers
        self.downsample_layers = nn.LayerList()
        stem = nn.Sequential(
            nn.Conv2D(
                in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(
                dims[0], eps=1e-6, data_format="channels_first"))
        self.downsample_layers.append(stem)
        for idx in range(3):
            ch_in, ch_out = dims[idx], dims[idx + 1]
            self.downsample_layers.append(
                nn.Sequential(
                    LayerNorm(
                        ch_in, eps=1e-6, data_format="channels_first"),
                    nn.Conv2D(
                        ch_in, ch_out, kernel_size=2, stride=2), ))

        # 4 feature resolution stages, each consisting of multiple residual blocks
        self.stages = nn.LayerList()
        # One drop-path rate per block, linearly spaced over all blocks.
        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
        offset = 0
        for idx in range(4):
            depth = depths[idx]
            blocks = [
                Block(
                    dim=dims[idx],
                    drop_path=rate,
                    layer_scale_init_value=layer_scale_init_value)
                for rate in dp_rates[offset:offset + depth]
            ]
            self.stages.append(nn.Sequential(*blocks))
            offset += depth

        self.return_idx = return_idx
        self.dims = [dims[i] for i in return_idx]

        self.norm_output = norm_output
        if norm_output:
            self.norms = nn.LayerList([
                LayerNorm(
                    channels, eps=1e-6, data_format="channels_first")
                for channels in self.dims
            ])

        self.apply(self._init_weights)

        if pretrained is not None:
            # A string containing 'http' is treated as a URL to download,
            # anything else as a local path.
            if 'http' in pretrained:
                weights_path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:
                weights_path = pretrained
            self.set_state_dict(paddle.load(weights_path))

    def _init_weights(self, m):
        """Initialize Conv2D / Linear layers: trunc-normal weight, zero bias."""
        if not isinstance(m, (nn.Conv2D, nn.Linear)):
            return
        trunc_normal_(m.weight)
        zeros_(m.bias)

    def forward_features(self, x):
        """Run all 4 stages and return the features picked by ``return_idx``."""
        stage_feats = []
        for downsample, stage in zip(self.downsample_layers, self.stages):
            x = stage(downsample(x))
            stage_feats.append(x)

        selected = [stage_feats[i] for i in self.return_idx]
        if self.norm_output:
            # self.norms holds one LayerNorm per returned feature, in order.
            selected = [
                norm(feat) for norm, feat in zip(self.norms, selected)
            ]

        return selected

    def forward(self, x):
        """Extract features from ``x['image']``."""
        return self.forward_features(x['image'])

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per returned feature, carrying its channels."""
        return [ShapeSpec(channels=c) for c in self.dims]
@ -0,0 +1,404 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import paddle |
||||
import paddle.nn as nn |
||||
import paddle.nn.functional as F |
||||
from paddle import ParamAttr |
||||
from paddle.regularizer import L2Decay |
||||
from paddlers.models.ppdet.core.workspace import register, serializable |
||||
from paddlers.models.ppdet.modeling.initializer import conv_init_ |
||||
from ..shape_spec import ShapeSpec |
||||
|
||||
__all__ = [ |
||||
'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer' |
||||
] |
||||
|
||||
|
||||
class BaseConv(nn.Layer):
    """Conv2D followed by BatchNorm2D and an ``x * sigmoid(x)`` activation.

    Args:
        in_channels (int): input channels of the conv
        out_channels (int): output channels of the conv
        ksize (int): kernel size; padding is set to ``(ksize - 1) // 2``
        stride (int): conv stride
        groups (int): conv groups, default 1
        bias (bool): passed as ``bias_attr`` of the conv, default False
        act (str): accepted but not read by this class; ``forward`` always
            applies ``x * sigmoid(x)``
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride,
                 groups=1,
                 bias=False,
                 act="silu"):
        super(BaseConv, self).__init__()
        pad = (ksize - 1) // 2
        self.conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            groups=groups,
            bias_attr=bias)
        # BatchNorm scale and shift use an L2Decay coefficient of 0.0.
        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        self.bn = nn.BatchNorm2D(
            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        self._init_weights()

    def _init_weights(self):
        """Initialize the conv with ``conv_init_``."""
        conv_init_(self.conv)

    def forward(self, x):
        """Apply conv, batch norm, then ``x * sigmoid(x)``."""
        # use 'x * F.sigmoid(x)' replace 'silu'
        normed = self.bn(self.conv(x))
        return normed * F.sigmoid(normed)
||||
|
||||
|
||||
class DWConv(nn.Layer):
    """Depthwise Conv: a depthwise ``BaseConv`` followed by a 1x1 ``BaseConv``.

    Args:
        in_channels (int): input channels
        out_channels (int): output channels of the 1x1 conv
        ksize (int): kernel size of the depthwise conv
        stride (int): stride of the depthwise conv, default 1
        bias (bool): forwarded to both convs, default False
        act (str): forwarded to both convs, default "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride=1,
                 bias=False,
                 act="silu"):
        super(DWConv, self).__init__()
        shared = dict(bias=bias, act=act)
        # groups == in_channels: one filter group per input channel.
        self.dw_conv = BaseConv(
            in_channels,
            in_channels,
            ksize=ksize,
            stride=stride,
            groups=in_channels,
            **shared)
        self.pw_conv = BaseConv(
            in_channels, out_channels, ksize=1, stride=1, groups=1, **shared)

    def forward(self, x):
        """Apply the depthwise conv, then the pointwise conv."""
        depthwise_out = self.dw_conv(x)
        return self.pw_conv(depthwise_out)
||||
|
||||
|
||||
class Focus(nn.Layer):
    """Focus width and height information into channel space, used in YOLOX.

    Args:
        in_channels (int): channels of the input; the conv receives
            ``in_channels * 4`` channels
        out_channels (int): output channels of the conv
        ksize (int): conv kernel size, default 3
        stride (int): conv stride, default 1
        bias (bool): forwarded to the conv, default False
        act (str): forwarded to the conv, default "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=3,
                 stride=1,
                 bias=False,
                 act="silu"):
        super(Focus, self).__init__()
        self.conv = BaseConv(
            in_channels * 4,
            out_channels,
            ksize=ksize,
            stride=stride,
            bias=bias,
            act=act)

    def forward(self, inputs):
        """Stack the four 2x2-strided sub-grids on axis 1, then convolve."""
        # inputs [bs, C, H, W] -> stacked [bs, 4C, H/2, W/2]
        # Order on the channel axis: top-left, bottom-left, top-right,
        # bottom-right (column offset is the outer loop, row offset the inner).
        patches = [
            inputs[:, :, row::2, col::2] for col in (0, 1) for row in (0, 1)
        ]
        stacked = paddle.concat(patches, 1)
        return self.conv(stacked)
||||
|
||||
|
||||
class BottleNeck(nn.Layer):
    """1x1 conv followed by a 3x3 conv, with an optional residual shortcut.

    Args:
        in_channels (int): input channels
        out_channels (int): output channels
        shortcut (bool): request the residual add; it is only applied when
            ``in_channels == out_channels``
        expansion (float): hidden channels are ``int(out_channels * expansion)``
        depthwise (bool): use ``DWConv`` instead of ``BaseConv`` for the 3x3 conv
        bias (bool): forwarded to both convs
        act (str): forwarded to both convs
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super(BottleNeck, self).__init__()
        hidden_channels = int(out_channels * expansion)
        if depthwise:
            second_conv_cls = DWConv
        else:
            second_conv_cls = BaseConv
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = second_conv_cls(
            hidden_channels, out_channels, ksize=3, stride=1, bias=bias,
            act=act)
        self.add_shortcut = shortcut and in_channels == out_channels

    def forward(self, x):
        """Return ``conv2(conv1(x))``, plus ``x`` when the shortcut is active."""
        out = self.conv2(self.conv1(x))
        return out + x if self.add_shortcut else out
||||
|
||||
|
||||
class SPPLayer(nn.Layer):
    """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX

    Args:
        in_channels (int): input channels; the hidden width is half of it
        out_channels (int): output channels of the final 1x1 conv
        kernel_sizes (tuple[int]): one stride-1 max pool is created per
            entry, padded with ``ks // 2``
        bias (bool): forwarded to both convs
        act (str): forwarded to both convs
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_sizes=(5, 9, 13),
                 bias=False,
                 act="silu"):
        super(SPPLayer, self).__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        pools = [
            nn.MaxPool2D(
                kernel_size=ks, stride=1, padding=ks // 2)
            for ks in kernel_sizes
        ]
        self.maxpoolings = nn.LayerList(pools)
        # conv2 sees the un-pooled branch plus one branch per pooling layer.
        branch_count = len(kernel_sizes) + 1
        self.conv2 = BaseConv(
            hidden_channels * branch_count,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        """Concatenate the reduced input with its pooled versions, then fuse."""
        reduced = self.conv1(x)
        pooled = [pool(reduced) for pool in self.maxpoolings]
        merged = paddle.concat([reduced] + pooled, axis=1)
        return self.conv2(merged)
||||
|
||||
|
||||
class SPPFLayer(nn.Layer):
    """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
        equivalent to SPP(k=(5, 9, 13))

    A single max pool is applied three times in sequence and the input is
    concatenated with the three intermediate results.

    Args:
        in_channels (int): input channels; the hidden width is half of it
        out_channels (int): output channels of the final 1x1 conv
        ksize (int): kernel size of the shared max pool, default 5
        bias (bool): forwarded to both convs
        act (str): forwarded to both convs
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=5,
                 bias=False,
                 act='silu'):
        super(SPPFLayer, self).__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.maxpooling = nn.MaxPool2D(
            kernel_size=ksize, stride=1, padding=ksize // 2)
        # Four branches are concatenated: the input and three pooled maps.
        self.conv2 = BaseConv(
            hidden_channels * 4,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        """Pool three times in sequence, concatenate all stages, then fuse."""
        feats = [self.conv1(x)]
        for _ in range(3):
            # Each pooling step consumes the previous step's output.
            feats.append(self.maxpooling(feats[-1]))
        return self.conv2(paddle.concat(feats, axis=1))
||||
|
||||
|
||||
class CSPLayer(nn.Layer):
    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5

    Args:
        in_channels (int): input channels
        out_channels (int): output channels
        num_blocks (int): number of ``BottleNeck`` blocks on the main branch
        shortcut (bool): forwarded to each ``BottleNeck``
        expansion (float): hidden channels are ``int(out_channels * expansion)``
        depthwise (bool): forwarded to each ``BottleNeck``
        bias (bool): forwarded to all convs
        act (str): forwarded to all convs
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=1,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super(CSPLayer, self).__init__()
        hidden_channels = int(out_channels * expansion)
        # conv1 feeds the bottleneck branch, conv2 the bypass branch.
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        blocks = [
            BottleNeck(
                hidden_channels,
                hidden_channels,
                shortcut=shortcut,
                expansion=1.0,
                depthwise=depthwise,
                bias=bias,
                act=act) for _ in range(num_blocks)
        ]
        self.bottlenecks = nn.Sequential(*blocks)
        self.conv3 = BaseConv(
            hidden_channels * 2,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        """Fuse the bottleneck branch and the bypass branch with conv3."""
        main_branch = self.bottlenecks(self.conv1(x))
        bypass_branch = self.conv2(x)
        merged = paddle.concat([main_branch, bypass_branch], axis=1)
        return self.conv3(merged)
||||
|
||||
|
||||
@register
@serializable
class CSPDarkNet(nn.Layer):
    """
    CSPDarkNet backbone.

    Args:
        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
        depth_mult (float): Depth multiplier, multiply number of blocks in
            CSPLayer, default as 1.0.
        width_mult (float): Width multiplier, multiply number of channels in
            each layer, default as 1.0.
        depthwise (bool): Whether to use depth-wise conv layer.
        act (str): Activation function type, default as 'silu'.
        trt (bool): Accepted (and listed in ``__shared__``) but not read by
            ``__init__``.
        return_idx (list): Index of stages whose feature maps are returned.
            Index 0 refers to the stem, index k (k >= 1) to the k-th stage.
    """

    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']

    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
    arch_settings = {
        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
              [256, 512, 9, True, False], [512, 1024, 3, False, True]],
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 768, 3, True, False],
               [768, 1024, 3, True, True]],
    }

    def __init__(self,
                 arch='X',
                 depth_mult=1.0,
                 width_mult=1.0,
                 depthwise=False,
                 act='silu',
                 trt=False,
                 return_idx=[2, 3, 4]):
        super(CSPDarkNet, self).__init__()
        self.arch = arch
        self.return_idx = return_idx
        Conv = DWConv if depthwise else BaseConv
        arch_setting = self.arch_settings[arch]
        # Stem width: first stage's input channels scaled by width_mult.
        base_channels = int(arch_setting[0][0] * width_mult)

        # Note: differences between the latest YOLOv5 and the original YOLOX
        # 1. self.stem: Conv stem (YOLOv5) or Focus stem (YOLOX)
        # 2. use SPPF (in YOLOv5) or SPP (in YOLOX)
        # 3. put SPPF after (YOLOv5) or SPP before (YOLOX) the CSPLayer of the
        #    stage that has use_spp set
        # 4. whether that stage's CSPLayer adds a shortcut: True in YOLOv5,
        #    False in YOLOX
        if arch in ['P5', 'P6']:
            # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernel size)
            self.stem = Conv(
                3, base_channels, ksize=6, stride=2, bias=False, act=act)
            # NOTE(review): assigned but not read on this path; the SPPFLayer
            # below is created with a literal ksize=5.
            spp_kernal_sizes = 5
        elif arch in ['X']:
            # in the original YOLOX, use Focus stem, and SPP (three spp kernel sizes)
            self.stem = Focus(
                3, base_channels, ksize=3, stride=1, bias=False, act=act)
            spp_kernal_sizes = (5, 9, 13)
        else:
            # NOTE(review): an arch that is not a key of arch_settings already
            # raises KeyError at the lookup above, so this branch is only
            # reached for a key of arch_settings that is not handled here.
            raise AttributeError("Unsupported arch type: {}".format(arch))

        # _out_channels[0] is the stem width; each stage appends its own.
        _out_channels = [base_channels]
        # layers_num is a running counter used in the sublayer names; it is
        # incremented after every registered sublayer, so the registration
        # order below determines the parameter names.
        layers_num = 1
        self.csp_dark_blocks = []

        for i, (in_channels, out_channels, num_blocks, shortcut,
                use_spp) in enumerate(arch_setting):
            # width_mult scales the channels, depth_mult the block count
            # (with a minimum of one block).
            in_channels = int(in_channels * width_mult)
            out_channels = int(out_channels * width_mult)
            _out_channels.append(out_channels)
            num_blocks = max(round(num_blocks * depth_mult), 1)
            stage = []

            # Every stage starts with a kernel-3, stride-2 conv.
            conv_layer = self.add_sublayer(
                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
                Conv(
                    in_channels, out_channels, 3, 2, bias=False, act=act))
            stage.append(conv_layer)
            layers_num += 1

            if use_spp and arch in ['X']:
                # in YOLOX use SPPLayer
                spp_layer = self.add_sublayer(
                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
                    SPPLayer(
                        out_channels,
                        out_channels,
                        kernel_sizes=spp_kernal_sizes,
                        bias=False,
                        act=act))
                stage.append(spp_layer)
                layers_num += 1

            csp_layer = self.add_sublayer(
                'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
                CSPLayer(
                    out_channels,
                    out_channels,
                    num_blocks=num_blocks,
                    shortcut=shortcut,
                    depthwise=depthwise,
                    bias=False,
                    act=act))
            stage.append(csp_layer)
            layers_num += 1

            if use_spp and arch in ['P5', 'P6']:
                # in latest YOLOv5 use SPPFLayer instead of SPPLayer
                sppf_layer = self.add_sublayer(
                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
                    SPPFLayer(
                        out_channels,
                        out_channels,
                        ksize=5,
                        bias=False,
                        act=act))
                stage.append(sppf_layer)
                layers_num += 1

            # The sublayers were registered above via add_sublayer; this plain
            # list only groups them per stage for forward().
            self.csp_dark_blocks.append(nn.Sequential(*stage))

        # Keep only the channels / strides of the stages named in return_idx.
        self._out_channels = [_out_channels[i] for i in self.return_idx]
        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]

    def forward(self, inputs):
        """Run the stem and all stages on ``inputs['image']``.

        Returns:
            list: outputs of the stages whose 1-based index is in
            ``self.return_idx``, in stage order.
        """
        x = inputs['image']
        outputs = []
        x = self.stem(x)
        for i, layer in enumerate(self.csp_dark_blocks):
            x = layer(x)
            # Stages are numbered from 1 (0 is the stem, which is never
            # collected here).
            if i + 1 in self.return_idx:
                outputs.append(x)
        return outputs

    @property
    def out_shape(self):
        """One ``ShapeSpec`` (channels, stride) per entry of ``return_idx``."""
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self._out_channels, self.strides)
        ]
@ -0,0 +1,321 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
import paddle |
||||
import paddle.nn as nn |
||||
import paddle.nn.functional as F |
||||
from paddle import ParamAttr |
||||
from paddle.regularizer import L2Decay |
||||
from paddle.nn.initializer import Constant |
||||
|
||||
from paddlers.models.ppdet.modeling.ops import get_act_fn |
||||
from paddlers.models.ppdet.core.workspace import register, serializable |
||||
from ..shape_spec import ShapeSpec |
||||
|
||||
__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] |
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Bias-free 2-D convolution followed by batch norm and an activation.

    Args:
        ch_in (int): Number of input channels.
        ch_out (int): Number of output channels.
        filter_size (int): Convolution kernel size.
        stride (int): Convolution stride.
        groups (int): Number of convolution groups.
        padding (int): Convolution padding.
        act (str|dict|callable|None): Activation. ``None``, a string or a
            dict is resolved through ``get_act_fn``; any other value is used
            as the activation as-is.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 act=None):
        super(ConvBNLayer, self).__init__()

        conv_kwargs = dict(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias_attr=False)
        self.conv = nn.Conv2D(**conv_kwargs)

        # Both BN affine parameters are registered with an L2Decay(0.0)
        # regularizer.
        self.bn = nn.BatchNorm2D(
            ch_out,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        if act is None or isinstance(act, (str, dict)):
            self.act = get_act_fn(act)
        else:
            self.act = act

    def forward(self, x):
        """Apply convolution, batch norm and activation in that order."""
        return self.act(self.bn(self.conv(x)))
||||
|
||||
|
||||
class RepVggBlock(nn.Layer):
    """Two parallel conv-BN branches (3x3 and 1x1) that can be fused into one conv.

    Before ``convert_to_deploy`` the output is
    ``act(conv1(x) + [alpha *] conv2(x))``. Afterwards a single 3x3
    ``nn.Conv2D`` named ``conv`` replaces both branches.

    Args:
        ch_in (int): Number of input channels.
        ch_out (int): Number of output channels.
        act (str|dict|callable|None): Activation. ``None``, a string or a
            dict is resolved through ``get_act_fn``; any other value is used
            as-is.
        alpha (bool): When truthy, a learnable scalar (initialised to 1.0)
            scales the 1x1 branch.
    """

    def __init__(self, ch_in, ch_out, act='relu', alpha=False):
        super(RepVggBlock, self).__init__()
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=None)
        self.conv2 = ConvBNLayer(
            ch_in, ch_out, 1, stride=1, padding=0, act=None)
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act
        if alpha:
            self.alpha = self.create_parameter(
                shape=[1],
                attr=ParamAttr(initializer=Constant(value=1.)),
                dtype="float32")
        else:
            self.alpha = None

    def forward(self, x):
        """Return the activated block output for ``x``."""
        if hasattr(self, 'conv'):
            # Deploy mode: the branches were folded into a single conv.
            y = self.conv(x)
        elif self.alpha is not None:
            # Test for the parameter's existence, not its value: a learned
            # alpha of 0 must still weight the 1x1 branch by 0.
            y = self.conv1(x) + self.alpha * self.conv2(x)
        else:
            y = self.conv1(x) + self.conv2(x)
        return self.act(y)

    def convert_to_deploy(self):
        """Fold both branches into ``self.conv`` and delete them.

        Calling this again after the branches are gone is a no-op.
        """
        if not (hasattr(self, 'conv1') and hasattr(self, 'conv2')):
            # Already converted: nothing left to fuse.
            return
        if not hasattr(self, 'conv'):
            self.conv = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=3,
                stride=1,
                padding=1,
                groups=1)
        kernel, bias = self.get_equivalent_kernel_bias()
        self.conv.weight.set_value(kernel)
        self.conv.bias.set_value(bias)
        self.__delattr__('conv1')
        self.__delattr__('conv2')

    def get_equivalent_kernel_bias(self):
        """Return the ``(kernel, bias)`` of the single 3x3 conv equivalent to both branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
        padded1x1 = self._pad_1x1_to_3x3_tensor(kernel1x1)
        if self.alpha is not None:
            return (kernel3x3 + self.alpha * padded1x1,
                    bias3x3 + self.alpha * bias1x1)
        return kernel3x3 + padded1x1, bias3x3 + bias1x1

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Pad a 1x1 kernel by one element on each spatial side; ``None`` maps to 0."""
        if kernel1x1 is None:
            return 0
        return nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold the batch-norm statistics of ``branch`` into its conv weight.

        Args:
            branch (ConvBNLayer|None): Branch exposing ``conv`` and ``bn``.

        Returns:
            tuple: ``(kernel, bias)``, or ``(0, 0)`` when ``branch`` is None.
        """
        if branch is None:
            return 0, 0
        kernel = branch.conv.weight
        running_mean = branch.bn._mean
        running_var = branch.bn._variance
        gamma = branch.bn.weight
        beta = branch.bn.bias
        eps = branch.bn._epsilon
        std = (running_var + eps).sqrt()
        # Per-output-channel scale, shaped to broadcast over the kernel.
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
    """A 3x3 ``ConvBNLayer`` followed by a ``RepVggBlock``, with optional residual add.

    Args:
        ch_in (int): Number of input channels.
        ch_out (int): Number of output channels; asserted equal to ``ch_in``.
        act (str|dict|callable|None): Activation forwarded to both sub-layers.
        shortcut (bool): When truthy, the input is added to the output.
        use_alpha (bool): Forwarded to ``RepVggBlock`` as ``alpha``.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 act='relu',
                 shortcut=True,
                 use_alpha=False):
        super(BasicBlock, self).__init__()
        assert ch_in == ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=act)
        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
        self.shortcut = shortcut

    def forward(self, x):
        """Return ``conv2(conv1(x))``, plus ``x`` when the shortcut is enabled."""
        branch = self.conv2(self.conv1(x))
        if not self.shortcut:
            return branch
        return paddle.add(x, branch)
||||
|
||||
|
||||
class EffectiveSELayer(nn.Layer):
    """ Effective Squeeze-Excitation
    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667

    Args:
        channels (int): Channel count of the input; the 1x1 conv keeps it.
        act (str|dict|callable|None): Gate activation. ``None``, a string or
            a dict is resolved through ``get_act_fn``; any other value is
            used as-is.
    """

    def __init__(self, channels, act='hardsigmoid'):
        super(EffectiveSELayer, self).__init__()
        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
        if act is None or isinstance(act, (str, dict)):
            self.act = get_act_fn(act)
        else:
            self.act = act

    def forward(self, x):
        """Scale ``x`` by a gate computed from its mean over axes 2 and 3."""
        squeezed = x.mean((2, 3), keepdim=True)
        gate = self.act(self.fc(squeezed))
        return x * gate
||||
|
||||
|
||||
class CSPResStage(nn.Layer):
    """One CSP stage: optional stride-2 conv, two 1x1 branches, concat, 1x1 fuse.

    Args:
        block_fn (callable): Factory called as
            ``block_fn(ch, ch, act=..., shortcut=True, use_alpha=...)`` to
            build each block of the second branch.
        ch_in (int): Number of input channels.
        ch_out (int): Number of output channels.
        n (int): Number of blocks in the second branch.
        stride (int): A down-sampling conv is created only when this is 2.
        act (str|dict|callable|None): Activation for the ConvBNLayers/blocks.
        attn: Any truthy value inserts an ``EffectiveSELayer`` before the
            final conv; a falsy value disables it.
        use_alpha (bool): Forwarded to ``block_fn``.
    """

    def __init__(self,
                 block_fn,
                 ch_in,
                 ch_out,
                 n,
                 stride,
                 act='relu',
                 attn='eca',
                 use_alpha=False):
        super(CSPResStage, self).__init__()

        ch_mid = (ch_in + ch_out) // 2
        ch_half = ch_mid // 2

        self.conv_down = ConvBNLayer(
            ch_in, ch_mid, 3, stride=2, padding=1,
            act=act) if stride == 2 else None
        self.conv1 = ConvBNLayer(ch_mid, ch_half, 1, act=act)
        self.conv2 = ConvBNLayer(ch_mid, ch_half, 1, act=act)

        branch_blocks = []
        for _ in range(n):
            branch_blocks.append(
                block_fn(
                    ch_half,
                    ch_half,
                    act=act,
                    shortcut=True,
                    use_alpha=use_alpha))
        self.blocks = nn.Sequential(*branch_blocks)

        self.attn = EffectiveSELayer(
            ch_mid, act='hardsigmoid') if attn else None

        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)

    def forward(self, x):
        """Run the stage on ``x`` and return the fused feature map."""
        if self.conv_down is not None:
            x = self.conv_down(x)
        plain = self.conv1(x)
        processed = self.blocks(self.conv2(x))
        merged = paddle.concat([plain, processed], axis=1)
        if self.attn is not None:
            merged = self.attn(merged)
        return self.conv3(merged)
||||
|
||||
|
||||
@register
@serializable
class CSPResNet(nn.Layer):
    """CSPResNet backbone: a conv stem followed by stride-2 ``CSPResStage`` stages.

    Args:
        layers (list[int]): Block count per stage, scaled by ``depth_mult``.
        channels (list[int]): Stem output width followed by each stage's
            output width, scaled by ``width_mult``.
        act (str|dict|callable|None): Activation. ``None``, a string or a
            dict is resolved through ``get_act_fn(act, trt=trt)``.
        return_idx (list[int]): 0-based indices of the stages whose outputs
            are returned.
        depth_wise (bool): Accepted but not used by this class.
        use_large_stem (bool): Use a three-conv stem instead of two convs.
        width_mult (float): Multiplier applied to ``channels``.
        depth_mult (float): Multiplier applied to ``layers``.
        trt (bool): Forwarded to ``get_act_fn``.
        use_checkpoint (bool): When True, stages run through
            ``paddle.distributed.fleet.utils.recompute`` while training, and
            ``paddle.seed(0)`` is called at construction.
        use_alpha (bool): Forwarded to every stage.
        **args: Accepted and ignored.
    """
    __shared__ = ['width_mult', 'depth_mult', 'trt']

    # The list defaults are only read, never mutated, inside __init__.
    def __init__(self,
                 layers=[3, 6, 6, 3],
                 channels=[64, 128, 256, 512, 1024],
                 act='swish',
                 return_idx=[1, 2, 3],
                 depth_wise=False,
                 use_large_stem=False,
                 width_mult=1.0,
                 depth_mult=1.0,
                 trt=False,
                 use_checkpoint=False,
                 use_alpha=False,
                 **args):
        super(CSPResNet, self).__init__()
        self.use_checkpoint = use_checkpoint
        channels = [max(round(width * width_mult), 1) for width in channels]
        layers = [max(round(depth * depth_mult), 1) for depth in layers]
        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act, trt=trt)

        stem_out = channels[0]
        stem_half = stem_out // 2
        # The sub-layer names ('conv1', 'conv2', 'conv3') are runtime strings
        # and are kept exactly.
        stem_layers = [('conv1', ConvBNLayer(
            3, stem_half, 3, stride=2, padding=1, act=act))]
        if use_large_stem:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_half, stem_half, 3, stride=1, padding=1, act=act)))
            stem_layers.append(('conv3', ConvBNLayer(
                stem_half, stem_out, 3, stride=1, padding=1, act=act)))
        else:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_half, stem_out, 3, stride=1, padding=1, act=act)))
        self.stem = nn.Sequential(*stem_layers)

        n = len(channels) - 1
        stage_layers = []
        for i in range(n):
            stage = CSPResStage(
                BasicBlock,
                channels[i],
                channels[i + 1],
                layers[i],
                2,
                act=act,
                use_alpha=use_alpha)
            stage_layers.append((str(i), stage))
        self.stages = nn.Sequential(*stage_layers)

        self._out_channels = channels[1:]
        self._out_strides = [4 * 2**i for i in range(n)]
        self.return_idx = return_idx
        if use_checkpoint:
            paddle.seed(0)

    def forward(self, inputs):
        """Return the outputs of the stages listed in ``return_idx``.

        Args:
            inputs (dict): Batch dictionary; only ``'image'`` is read.

        Returns:
            list: Selected stage outputs, in stage order.
        """
        x = self.stem(inputs['image'])
        outs = []
        for idx, stage in enumerate(self.stages):
            if self.use_checkpoint and self.training:
                x = paddle.distributed.fleet.utils.recompute(
                    stage, x, **{"preserve_rng_state": True})
            else:
                x = stage(x)
            if idx in self.return_idx:
                outs.append(x)

        return outs

    @property
    def out_shape(self):
        """List a ``ShapeSpec`` (channels, stride) for each index in ``return_idx``."""
        specs = []
        for i in self.return_idx:
            specs.append(
                ShapeSpec(
                    channels=self._out_channels[i],
                    stride=self._out_strides[i]))
        return specs
@ -0,0 +1,266 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
""" |
||||
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. |
||||
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py |
||||
The copyright of microsoft/Swin-Transformer is as follows: |
||||
MIT License [see LICENSE for details] |
||||
""" |
||||
|
||||
import paddle |
||||
import paddle.nn as nn |
||||
from paddle import ParamAttr |
||||
from paddle.regularizer import L2Decay |
||||
from paddle.nn.initializer import Normal, Constant |
||||
|
||||
from paddlers.models.ppdet.modeling.ops import get_act_fn |
||||
from paddlers.models.ppdet.modeling.layers import ConvNormLayer |
||||
|
||||
|
||||
class MobileOneBlock(nn.Layer):
    """Re-parameterisable depthwise + pointwise block.

    Training structure, as built in ``__init__``:

    * stage 1: ``conv_num`` parallel depthwise ``ConvNormLayer`` branches of
      size ``kernel_size``, one depthwise 1x1 ``ConvNormLayer`` and an
      optional ``BatchNorm2D`` identity branch, summed then activated;
    * stage 2: ``conv_num`` parallel 1x1 ``ConvNormLayer`` branches and an
      optional ``BatchNorm2D`` identity branch, summed then activated.

    The identity branches exist only when ``ch_in == ch_out`` and
    ``stride == 1``. ``convert_to_deploy`` folds each stage into one plain
    convolution (``conv1`` and ``conv2``) and removes the branches.

    Args:
        ch_in (int): Number of input channels.
        ch_out (int): Number of output channels.
        stride (int): Stride of the stage-1 convolutions.
        kernel_size (int): Kernel size of the stage-1 depthwise branches.
        conv_num (int): Number of parallel branches per stage.
        norm_type, norm_decay, norm_groups, bias_on, lr_scale, freeze_norm,
        initializer, skip_quant: Forwarded unchanged to every
            ``ConvNormLayer``.
        act (str|dict|callable|None): Activation. ``None``, a string or a
            dict is resolved through ``get_act_fn``; any other value is used
            as-is.
    """

    def __init__(
            self,
            ch_in,
            ch_out,
            stride,
            kernel_size,
            conv_num=1,
            norm_type='bn',
            norm_decay=0.,
            norm_groups=32,
            bias_on=False,
            lr_scale=1.,
            freeze_norm=False,
            initializer=Normal(
                mean=0., std=0.01),
            skip_quant=False,
            act='relu', ):
        super(MobileOneBlock, self).__init__()

        self.ch_in = ch_in
        self.ch_out = ch_out
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2
        self.k = conv_num

        # Options shared by every ConvNormLayer created below.
        norm_cfg = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            norm_groups=norm_groups,
            bias_on=bias_on,
            lr_scale=lr_scale,
            freeze_norm=freeze_norm,
            initializer=initializer,
            skip_quant=skip_quant)

        self.depth_conv = nn.LayerList()
        self.point_conv = nn.LayerList()
        for _ in range(self.k):
            self.depth_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_in,
                    kernel_size,
                    stride=stride,
                    groups=ch_in,
                    **norm_cfg))
            self.point_conv.append(
                ConvNormLayer(
                    ch_in, ch_out, 1, stride=1, groups=1, **norm_cfg))
        self.rbr_1x1 = ConvNormLayer(
            ch_in, ch_in, 1, stride=self.stride, groups=ch_in, **norm_cfg)

        has_identity = ch_in == ch_out and self.stride == 1
        self.rbr_identity_st1 = nn.BatchNorm2D(
            num_features=ch_in,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if has_identity else None
        self.rbr_identity_st2 = nn.BatchNorm2D(
            num_features=ch_out,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if has_identity else None
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act

    def forward(self, x):
        """Return the block output, using the fused convs when they exist."""
        if hasattr(self, "conv1") and hasattr(self, "conv2"):
            # Deploy mode: one conv per stage.
            return self.act(self.conv2(self.act(self.conv1(x))))

        if self.rbr_identity_st1 is None:
            id_out_st1 = 0
        else:
            id_out_st1 = self.rbr_identity_st1(x)

        x1_1 = 0
        for i in range(self.k):
            x1_1 += self.depth_conv[i](x)

        x1_2 = self.rbr_1x1(x)
        x1 = self.act(x1_1 + x1_2 + id_out_st1)

        if self.rbr_identity_st2 is None:
            id_out_st2 = 0
        else:
            id_out_st2 = self.rbr_identity_st2(x1)

        x2_1 = 0
        for i in range(self.k):
            x2_1 += self.point_conv[i](x1)
        return self.act(x2_1 + id_out_st2)

    def convert_to_deploy(self):
        """Fold all training branches into ``conv1``/``conv2`` and delete them.

        Calling this again after the branches are gone is a no-op.
        """
        if not hasattr(self, 'depth_conv'):
            # The branches were already folded and removed by an earlier call.
            return
        if not hasattr(self, 'conv1'):
            self.conv1 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_in,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                groups=self.ch_in,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))
        if not hasattr(self, 'conv2'):
            self.conv2 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=1,
                stride=1,
                padding='SAME',
                groups=1,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))

        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(
        )
        self.conv1.weight.set_value(conv1_kernel)
        self.conv1.bias.set_value(conv1_bias)
        self.conv2.weight.set_value(conv2_kernel)
        self.conv2.bias.set_value(conv2_bias)
        self.__delattr__('depth_conv')
        self.__delattr__('point_conv')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity_st1'):
            self.__delattr__('rbr_identity_st1')
        if hasattr(self, 'rbr_identity_st2'):
            self.__delattr__('rbr_identity_st2')

    def get_equivalent_kernel_bias(self):
        """Return ``(conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)`` for the fused convs."""
        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st1, kernel_size=self.kernel_size)

        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st2, kernel_size=1)

        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
            st1_kernel1x1) + st1_kernelid

        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid

        conv2_kernel = st2_kernel1x1 + st2_kernelid
        conv2_bias = st2_bias1x1 + st2_biasid

        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Pad a 1x1 kernel spatially up to ``self.kernel_size``; ``None`` maps to 0."""
        if kernel1x1 is None:
            return 0
        padding_size = (self.kernel_size - 1) // 2
        return nn.functional.pad(
            kernel1x1,
            [padding_size, padding_size, padding_size, padding_size])

    @staticmethod
    def _fold_norm(kernel, norm):
        """Fold the statistics of the norm layer ``norm`` into ``kernel``; return ``(kernel, bias)``."""
        running_mean = norm._mean
        gamma = norm.weight
        beta = norm.bias
        std = (norm._variance + norm._epsilon).sqrt()
        # Per-output-channel scale, shaped to broadcast over the kernel.
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std

    def _fuse_bn_tensor(self, branch, kernel_size=3):
        """Return the fused ``(kernel, bias)`` of a branch.

        Args:
            branch (nn.LayerList|ConvNormLayer|nn.BatchNorm2D|None): A list
                of conv-norm layers (their fused results are summed), a
                single conv-norm layer, or a bare batch norm treated as an
                identity convolution followed by that norm.
            kernel_size (int): Spatial size of the identity kernel built for
                a bare ``BatchNorm2D`` branch; unused otherwise.

        Returns:
            tuple: ``(kernel, bias)``, or ``(0, 0)`` when ``branch`` is None.

        Raises:
            ValueError: If an identity kernel is requested with a
                ``kernel_size`` below 1.
        """
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.LayerList):
            fused_kernels = []
            fused_bias = []
            for block in branch:
                kernel, bias = self._fold_norm(block.conv.weight, block.norm)
                fused_kernels.append(kernel)
                fused_bias.append(bias)
            return sum(fused_kernels), sum(fused_bias)

        if isinstance(branch, ConvNormLayer):
            return self._fold_norm(branch.conv.weight, branch.norm)

        assert isinstance(branch, nn.BatchNorm2D)
        # kernel_size == 1 builds a dense [ch_in, ch_in, 1, 1] identity;
        # larger sizes build a [ch_in, 1, k, k] kernel with a centred 1.
        input_dim = self.ch_in if kernel_size == 1 else 1
        kernel_value = paddle.zeros(
            shape=[self.ch_in, input_dim, kernel_size, kernel_size],
            dtype='float32')
        if kernel_size > 1:
            center = (kernel_size - 1) // 2
            for i in range(self.ch_in):
                kernel_value[i, i % input_dim, center, center] = 1
        elif kernel_size == 1:
            for i in range(self.ch_in):
                kernel_value[i, i % input_dim, 0, 0] = 1
        else:
            raise ValueError("Invalid kernel size recieved!")
        kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
        return self._fold_norm(kernel, branch)
@ -0,0 +1,74 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import paddle |
||||
import paddle.nn as nn |
||||
|
||||
from paddle.nn.initializer import TruncatedNormal, Constant, Assign |
||||
|
||||
# Common initializations |
||||
ones_ = Constant(value=1.) |
||||
zeros_ = Constant(value=0.) |
||||
trunc_normal_ = TruncatedNormal(std=.02) |
||||
|
||||
|
||||
# Common Layers |
||||
def drop_path(x, drop_prob=0., training=False):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    The original name is misleading, as 'Drop Connect' is a different form of dropout from a separate paper.
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956

    Args:
        x: Input tensor; returned unchanged when dropping is inactive.
        drop_prob (float): Probability of zeroing a sample.
        training (bool): Dropping is applied only when this is truthy.

    Returns:
        ``x`` itself when ``drop_prob == 0.`` or ``training`` is falsy;
        otherwise ``x / keep_prob`` multiplied by a per-sample 0/1 mask.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob)
    # One random draw per entry of axis 0, broadcast over all other axes.
    mask_shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    # floor(keep_prob + U[0, 1)) yields the 0/1 mask.
    mask = paddle.floor(keep_prob + paddle.rand(mask_shape, dtype=x.dtype))
    return x.divide(keep_prob) * mask
||||
|
||||
|
||||
class DropPath(nn.Layer):
    """Layer form of ``drop_path`` that follows the layer's training flag.

    Args:
        drop_prob (float|None): Drop probability handed to ``drop_path``.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path`` with the stored probability and ``self.training``."""
        rate = self.drop_prob
        active = self.training
        return drop_path(x, rate, active)
||||
|
||||
|
||||
class Identity(nn.Layer):
    """Pass-through layer: ``forward`` returns its argument unchanged."""

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        """Return ``input`` as-is."""
        result = input
        return result
||||
|
||||
|
||||
# common funcs |
||||
|
||||
|
||||
def to_2tuple(x):
    """Return ``x`` unchanged if it is a list or tuple, else the pair ``(x, x)``."""
    return x if isinstance(x, (list, tuple)) else (x, x)
||||
|
||||
|
||||
def add_parameter(layer, datas, name=None):
    """Create a parameter on ``layer`` initialised from ``datas``.

    Args:
        layer: Object providing ``create_parameter`` and ``add_parameter``.
        datas: Initial values; ``datas.shape`` gives the parameter shape.
        name (str|None): When truthy, the parameter is also registered on
            ``layer`` under this name.

    Returns:
        The created parameter.
    """
    created = layer.create_parameter(
        shape=(datas.shape), default_initializer=Assign(datas))
    if not name:
        return created
    layer.add_parameter(name, created)
    return created
@ -0,0 +1,634 @@ |
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import math |
||||
|
||||
import paddle |
||||
import paddle.nn as nn |
||||
import paddle.nn.functional as F |
||||
import numpy as np |
||||
from paddle.nn.initializer import Constant |
||||
|
||||
from paddlers.models.ppdet.modeling.shape_spec import ShapeSpec |
||||
from paddlers.models.ppdet.core.workspace import register, serializable |
||||
|
||||
from .transformer_utils import zeros_, DropPath, Identity |
||||
|
||||
|
||||
class Mlp(nn.Layer):
    """Two linear layers with an activation in between and dropout after each.

    Args:
        in_features (int): Input width.
        hidden_features (int|None): Hidden width; a falsy value falls back
            to ``in_features``.
        out_features (int|None): Output width; a falsy value falls back to
            ``in_features``.
        act_layer (callable): Zero-argument factory for the activation layer.
        drop (float): Value passed to the single ``nn.Dropout`` used after
            both the activation and the second linear layer.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply fc1 -> act -> dropout -> fc2 -> dropout."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
||||
|
||||
|
||||
class Attention(nn.Layer):
    """Multi-head self-attention with an optional learned relative position bias.

    Args:
        dim (int): Width of the input; also the output width of ``proj``.
        num_heads (int): Number of attention heads; ``dim // num_heads`` is
            the per-head width.
        qkv_bias (bool): When truthy, learnable biases are created for the
            query and value projections (the key projection gets a zero bias
            in ``forward``).
        qk_scale (float|None): Scale applied to ``q @ k^T``; a falsy value
            falls back to ``head_dim ** -0.5``.
        attn_drop (float): Value passed to the dropout on attention weights.
        proj_drop (float): Value passed to the dropout after ``proj``.
        window_size (sequence|None): When truthy, a relative position bias
            table and its index buffer are built from ``window_size[0]`` and
            ``window_size[1]``.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 window_size=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # The fused q/k/v projection has no bias of its own; biases, if
        # enabled, are kept as separate parameters and joined in forward().
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)

        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None
        if window_size:
            self.window_size = window_size
            # The 3 extra entries are the ones assigned to row 0, column 0
            # and element (0, 0) of the index matrix further down.
            self.num_relative_distance = (2 * window_size[0] - 1) * (
                2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = self.create_parameter(
                shape=(self.num_relative_distance, num_heads),
                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH
            # cls to token & token 2 cls & cls to cls

            # get pair-wise relative position index for each token inside the window
            coords_h = paddle.arange(window_size[0])
            coords_w = paddle.arange(window_size[1])
            coords = paddle.stack(paddle.meshgrid(
                [coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
            coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
            coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
            # Pairwise differences via broadcasting of the two unsqueezed views.
            relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
            )

            #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]   # 2, Wh*Ww, Wh*Wh
            relative_coords = relative_coords.transpose(
                (1, 2, 0))  #.contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += window_size[
                0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            # Scale the first coordinate so that summing both coordinates
            # gives a single flat index.
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            # One extra row and column (position 0) on top of the Wh*Ww grid.
            relative_position_index = \
                paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(
                -1)  # Wh*Ww, Wh*Ww
            # Order matters: element (0, 0) is written by all three
            # assignments and keeps the value of the last one.
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index",
                                 relative_position_index)
            # trunc_normal_(self.relative_position_bias_table, std=.0)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input tensor; axes 1 and 2 are read as sequence length ``N``
                and width ``C``.
            rel_pos_bias: Optional extra bias added to the attention logits.

        Returns:
            Tensor reshaped to ``(-1, N, C)`` after the output projection
            and dropout.
        """
        x_shape = paddle.shape(x)
        N, C = x_shape[1], x_shape[2]

        qkv_bias = None
        if self.q_bias is not None:
            # Bias layout follows the q, k, v order of the fused projection;
            # the key slot is filled with zeros.
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
        qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)

        # Split into (3, batch, heads, N, head_dim) so q, k, v can be indexed.
        qkv = qkv.reshape((-1, N, 3, self.num_heads,
                           C // self.num_heads)).transpose((2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale

        if self.relative_position_bias_table is not None:
            # Look up one bias vector per (row, column) index pair.
            relative_position_bias = self.relative_position_bias_table[
                self.relative_position_index.reshape([-1])].reshape([
                    self.window_size[0] * self.window_size[1] + 1,
                    self.window_size[0] * self.window_size[1] + 1, -1
                ])  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.transpose(
                (2, 0, 1))  #.contiguous()  # nH, Wh*Ww, Wh*Ww
            attn = attn + relative_position_bias.unsqueeze(0)
        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = nn.functional.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        # Merge the heads back into the last axis.
        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
||||
|
||||
|
||||
class Block(nn.Layer):
    """Pre-norm transformer block: attention and MLP, each in a residual branch.

    Args:
        dim (int): Token width.
        num_heads (int): Number of attention heads.
        mlp_ratio (float): Hidden width of the MLP as a multiple of ``dim``.
        qkv_bias (bool): Forwarded to ``Attention``.
        qk_scale (float|None): Forwarded to ``Attention``.
        drop (float): Dropout value for the attention projection and the MLP.
        attn_drop (float): Dropout value for the attention weights.
        drop_path (float): Stochastic-depth rate; ``Identity`` is used when
            it is not greater than 0.
        window_size (sequence|None): Forwarded to ``Attention``.
        init_values (float|None): When not None, two learnable per-channel
            scales initialised to this value multiply the branch outputs.
        act_layer (callable): Activation factory forwarded to ``Mlp``.
        norm_layer (str): Expression evaluated to obtain the class of the
            second norm layer.
        epsilon (float): Epsilon of the second norm layer.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 window_size=None,
                 init_values=None,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-5):
        super().__init__()
        # NOTE(review): norm1 is always nn.LayerNorm with epsilon=1e-6; the
        # norm_layer/epsilon arguments only affect norm2. Confirm intended.
        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            window_size=window_size)
        # Stochastic depth on both residual branches.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        # NOTE(review): eval() runs the norm_layer string as code; it must
        # only ever come from trusted configuration.
        self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)
        if init_values is None:
            self.gamma_1, self.gamma_2 = None, None
        else:
            self.gamma_1 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))
            self.gamma_2 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))

    def forward(self, x, rel_pos_bias=None):
        """Return ``x`` after the attention and MLP residual branches."""
        # The presence of gamma_1 decides whether both branches are scaled.
        scaled = self.gamma_1 is not None

        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
        if scaled:
            attn_out = self.gamma_1 * attn_out
        x = x + self.drop_path(attn_out)

        mlp_out = self.mlp(self.norm2(x))
        if scaled:
            mlp_out = self.gamma_2 * mlp_out
        x = x + self.drop_path(mlp_out)
        return x
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    Args:
        img_size (sequence): Two image extents; each is floor-divided by
            ``patch_size`` to obtain the patch counts.
        patch_size (int): Kernel size and stride of the projection conv.
        in_chans (int): Number of input channels.
        embed_dim (int): Number of output channels of the projection.
    """

    def __init__(self,
                 img_size=[224, 224],
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768):
        super().__init__()
        count_0 = img_size[0] // patch_size
        count_1 = img_size[1] // patch_size

        # As in the original code, index 0 feeds the "_w" attribute and
        # index 1 the "_h" attribute.
        self.num_patches_w = count_0
        self.num_patches_h = count_1
        self.patch_shape = (count_0, count_1)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = count_0 * count_1

        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    @property
    def num_patches_in_h(self):
        """Return ``img_size[1] // patch_size`` from the current attributes."""
        return self.img_size[1] // self.patch_size

    @property
    def num_patches_in_w(self):
        """Return ``img_size[0] // patch_size`` from the current attributes."""
        return self.img_size[0] // self.patch_size

    def forward(self, x, mask=None):
        """Project ``x`` with the patch conv; ``mask`` is accepted but unused."""
        # The four-way unpack only succeeds when x.shape has four entries.
        batch, chans, height, width = x.shape
        return self.proj(x)
||||
|
||||
|
||||
class RelativePositionBias(nn.Layer):
    """Learned relative position bias table with a precomputed index buffer.

    The index matrix has one extra row and column (position 0) on top of the
    ``Wh*Ww`` window grid. Row 0, column 0 and element (0, 0) each point to
    one of the three extra table entries.

    Args:
        window_size (sequence): ``(Wh, Ww)`` window extents.
        num_heads (int): Number of attention heads (table columns).
    """

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        self.num_relative_distance = (2 * window_size[0] - 1) * (
            2 * window_size[1] - 1) + 3
        # Fixed keyword: create_parameter expects ``default_initializer``.
        self.relative_position_bias_table = self.create_parameter(
            shape=(self.num_relative_distance, num_heads),
            default_initializer=zeros_)
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = paddle.arange(window_size[0])
        coords_w = paddle.arange(window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = coords.flatten(1)  # 2, Wh*Ww

        # Same unsqueeze-and-subtract form as the sibling Attention layer.
        relative_coords = paddle.unsqueeze(
            coords_flatten, 2) - paddle.unsqueeze(
                coords_flatten, 1)  # 2, Wh*Ww, Wh*Ww
        # Fixed method name: ``transpose`` (was misspelled ``transpos``).
        relative_coords = relative_coords.transpose(
            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        # Fixed keyword: paddle.zeros takes ``shape``, not ``size``.
        side = window_size[0] * window_size[1] + 1
        relative_position_index = paddle.zeros(
            shape=(side, ) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(
            -1)  # Wh*Ww, Wh*Ww
        # Order matters: element (0, 0) keeps the value of the last write.
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1
        self.register_buffer("relative_position_index", relative_position_index)

    def forward(self):
        """Return the bias gathered from the table, laid out as ``(nH, Wh*Ww+1, Wh*Ww+1)``."""
        side = self.window_size[0] * self.window_size[1] + 1
        # reshape([...]) replaces the torch-style view(...) calls, matching
        # how the sibling Attention layer gathers its bias.
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.reshape([-1])].reshape(
                [side, side, -1])  # Wh*Ww+1, Wh*Ww+1, nH
        return relative_position_bias.transpose((2, 0, 1))
||||
|
||||
|
||||
def get_sinusoid_encoding_table(n_position, d_hid, token=False):
    """Build a fixed sinusoidal position-encoding table.

    Even columns hold the sine and odd columns the cosine of
    ``position / 10000 ** (2 * (j // 2) / d_hid)``.

    Args:
        n_position (int): Number of positions (rows).
        d_hid (int): Encoding width (columns).
        token (bool): When truthy, one all-zero row is appended after the
            position rows.

    Returns:
        A float32 paddle tensor of shape ``(1, rows, d_hid)``, where
        ``rows`` is ``n_position`` plus one if ``token`` is truthy.
    """

    def get_position_angle_vec(position):
        return [
            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
            for hid_j in range(d_hid)
        ]

    sinusoid_table = np.array(
        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    if token:
        # numpy's keyword is ``axis``; the previous ``dim=0`` raised a
        # TypeError whenever token was enabled.
        sinusoid_table = np.concatenate(
            [sinusoid_table, np.zeros([1, d_hid])], axis=0)

    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
||||
|
||||
|
||||
@register
@serializable
class VisionTransformer(nn.Layer):
    """Vision Transformer backbone with support for patch input.

    The image is split into patches by ``PatchEmbed``, a cls token is
    prepended, an optional position embedding is added and the sequence is
    passed through ``depth`` ``Block`` layers. The outputs of the blocks
    listed in ``out_indices`` are reshaped back into feature maps and, when
    ``with_fpn`` is True, resampled by up to four FPN branches.

    Args:
        img_size (list[int]): input size forwarded to ``PatchEmbed``.
        patch_size (int): patch edge length. FPN branches are only defined
            for 8 and 16.
        in_chans (int): number of input channels, forwarded to ``PatchEmbed``.
        embed_dim (int): token embedding dimension.
        depth (int): number of ``Block`` layers.
        num_heads, mlp_ratio, qkv_bias, qk_scale, drop_rate, attn_drop_rate,
        norm_layer, init_values, epsilon: forwarded to every ``Block``.
        drop_path_rate (float): upper end of the linearly spaced per-block
            drop-path rates.
        use_rel_pos_bias (bool): pass the patch grid as ``window_size`` to
            every ``Block``.
        use_shared_rel_pos_bias (bool): build one ``RelativePositionBias``
            whose output is handed to all blocks.
        final_norm (bool): stored on the instance; not otherwise used here.
        pretrained (str|None): URL or local path of weights to load.
        out_indices (list[int]): indices of blocks whose output is returned
            (at most 4).
        use_abs_pos_emb (bool): use a learnable position embedding.
        use_sincos_pos_emb (bool): use a frozen 2D sin-cos position embedding
            (only consulted when ``use_abs_pos_emb`` is False).
        with_fpn (bool): build and apply the FPN branches.
        use_checkpoint (bool): recompute block activations during training.
        **args: accepted and ignored.
    """

    def __init__(self,
                 img_size=[672, 1092],
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer='nn.LayerNorm',
                 init_values=None,
                 use_rel_pos_bias=False,
                 use_shared_rel_pos_bias=False,
                 epsilon=1e-5,
                 final_norm=False,
                 pretrained=None,
                 out_indices=[3, 5, 7, 11],
                 use_abs_pos_emb=False,
                 use_sincos_pos_emb=True,
                 with_fpn=True,
                 use_checkpoint=False,
                 **args):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.with_fpn = with_fpn
        self.use_checkpoint = use_checkpoint
        self.use_sincos_pos_emb = use_sincos_pos_emb
        self.use_rel_pos_bias = use_rel_pos_bias
        self.final_norm = final_norm

        if use_checkpoint:
            paddle.seed(0)

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim)

        self.pos_w = self.patch_embed.num_patches_in_w
        self.pos_h = self.patch_embed.num_patches_in_h

        self.cls_token = self.create_parameter(
            shape=(1, 1, embed_dim),
            default_initializer=paddle.nn.initializer.Constant(value=0.))

        if use_abs_pos_emb:
            # Learnable embedding: one slot per patch plus one for cls.
            self.pos_embed = self.create_parameter(
                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
                default_initializer=paddle.nn.initializer.TruncatedNormal(
                    std=.02))
        elif use_sincos_pos_emb:
            # Fixed embedding stored as a frozen parameter so that it is
            # part of the state dict.
            pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
            self.pos_embed.set_value(pos_embed.numpy())
            self.pos_embed.stop_gradient = True
        else:
            self.pos_embed = None

        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(
                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        # Per-block drop-path rate, growing linearly with depth.
        dpr = np.linspace(0, drop_path_rate, depth)

        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                init_values=init_values,
                window_size=self.patch_embed.patch_shape
                if use_rel_pos_bias else None,
                epsilon=epsilon) for i in range(depth)
        ])

        # Weights are loaded before the FPN branches are created, exactly
        # as before, so only the layers defined above are populated.
        self.pretrained = pretrained
        self.init_weight()

        assert len(out_indices) <= 4, \
            'out_indices supports at most 4 entries, got {}'.format(
                len(out_indices))
        self.out_indices = out_indices
        self.out_channels = [embed_dim for _ in range(len(out_indices))]
        # NOTE(review): with fewer than 4 out_indices the strides are taken
        # from the END of [4, 8, 16, 32] while forward() applies the FPN
        # branches from the START (fpn1, fpn2, ...) -- confirm intended.
        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
            8 for _ in range(len(out_indices))
        ]

        self.norm = Identity()

        if self.with_fpn:
            self.init_fpn(
                embed_dim=embed_dim,
                patch_size=patch_size, )

    def init_weight(self):
        """Load ``self.pretrained`` (URL or local path) into the model.

        If the checkpoint carries a ``pos_embed`` whose shape differs from
        the model's, it is resized to the model's patch grid before loading.
        Does nothing when ``self.pretrained`` is falsy.
        """
        pretrained = self.pretrained
        if not pretrained:
            return

        if 'http' in pretrained:  # URL
            path = paddle.utils.download.get_weights_path_from_url(pretrained)
        else:  # model in local path
            path = pretrained

        load_state_dict = paddle.load(path)
        model_state_dict = self.state_dict()
        pos_embed_name = "pos_embed"

        # Skip the resize when the model has no position embedding at all;
        # previously this dereferenced ``None.shape``.
        if (pos_embed_name in load_state_dict.keys() and
                self.pos_embed is not None):
            load_pos_embed = paddle.to_tensor(
                load_state_dict[pos_embed_name], dtype="float32")
            if self.pos_embed.shape != load_pos_embed.shape:
                # The checkpoint grid is assumed square (cls slot excluded).
                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
                model_state_dict[pos_embed_name] = self.resize_pos_embed(
                    load_pos_embed, (pos_size, pos_size),
                    (self.pos_h, self.pos_w))

                load_state_dict[pos_embed_name] = model_state_dict[
                    pos_embed_name]

                print("Load pos_embed and resize it from {} to {} .".format(
                    load_pos_embed.shape, self.pos_embed.shape))

        self.set_state_dict(load_state_dict)
        print("Load load_state_dict....")

    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
        """Create the four FPN resampling branches ``fpn1`` .. ``fpn4``.

        Args:
            embed_dim (int): channel count of every branch.
            patch_size (int): 16 or 8; selects the up/down-sampling layout.
            out_with_norm (bool): use LayerNorm instead of Identity as
                ``self.norm``.

        Raises:
            ValueError: if ``patch_size`` is neither 16 nor 8. Previously no
                branch was created and forward() failed later on a missing
                ``fpn1`` attribute.
        """
        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.BatchNorm2D(embed_dim),
                nn.GELU(),
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn3 = Identity()

            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = Identity()

            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )

            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
        else:
            raise ValueError(
                "init_fpn only supports patch_size 8 or 16, got {}".format(
                    patch_size))

        if not out_with_norm:
            self.norm = Identity()
        else:
            self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)

    def interpolate_pos_encoding(self, x, w, h):
        """Return ``self.pos_embed`` resized to the token grid of ``x``.

        Args:
            x (Tensor): token sequence including the cls token at index 0
                of axis 1.
            w, h (int): input image extents; divided by the patch size to
                get the target grid. NOTE(review): forward() passes
                ``(x, h, w)`` positionally -- confirm the intended order.

        Returns:
            Tensor: the stored embedding when the grid already matches,
            otherwise the cls embedding concatenated with the bicubically
            interpolated patch embeddings.
        """
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        w0 = w // self.patch_embed.patch_size
        h0 = h // self.patch_embed.patch_size
        if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
            return self.pos_embed
        class_pos_embed = self.pos_embed[:, 0]
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape([
                1, self.patch_embed.num_patches_w,
                self.patch_embed.num_patches_h, dim
            ]).transpose((0, 3, 1, 2)),
            scale_factor=(w0 / self.patch_embed.num_patches_w,
                          h0 / self.patch_embed.num_patches_h),
            mode='bicubic', )
        assert int(w0) == patch_pos_embed.shape[-2] and int(
            h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.transpose(
            (0, 2, 3, 1)).reshape([1, -1, dim])
        return paddle.concat(
            (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)

    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
        """
        Resize pos_embed weight.
        Args:
            pos_embed (Tensor): the pos_embed weight
            old_hw (list[int]): the height and width of old pos_embed
            new_hw (list[int]): the height and width of new pos_embed
        Returns:
            Tensor: the resized pos_embed weight
        """
        # The cls slot is kept as is; only the patch grid is interpolated.
        cls_pos_embed = pos_embed[:, :1, :]
        pos_embed = pos_embed[:, 1:, :]

        pos_embed = pos_embed.transpose([0, 2, 1])
        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
        pos_embed = F.interpolate(
            pos_embed, new_hw, mode='bicubic', align_corners=False)
        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)

        return pos_embed

    def build_2d_sincos_position_embedding(
            self,
            embed_dim=768,
            temperature=10000., ):
        """Build a 2D sin-cos position embedding for the patch grid.

        Args:
            embed_dim (int): embedding size; must be divisible by 4 (one
                quarter each for sin/cos of the two grid axes).
            temperature (float): base of the frequency progression.

        Returns:
            Tensor: shape (1, h * w + 1, embed_dim); the first slot is an
            all-zero embedding for the cls token.
        """
        h, w = self.patch_embed.patch_shape
        grid_w = paddle.arange(w, dtype=paddle.float32)
        grid_h = paddle.arange(h, dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**omega)

        # Outer product of flattened coordinates and frequencies.
        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        pos_emb = paddle.concat(
            [
                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
                paddle.cos(out_h)
            ],
            axis=1)[None, :, :]

        pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
        pos_embed = paddle.concat([pe_token, pos_emb], axis=1)

        return pos_embed

    def forward(self, x):
        """Run the backbone.

        Args:
            x (Tensor|dict): image batch, or a dict holding it under
                ``'image'``.

        Returns:
            list[Tensor]: one feature map per selected block, passed through
            the FPN branches when ``with_fpn`` is True.
        """
        x = x['image'] if isinstance(x, dict) else x
        _, _, h, w = x.shape

        x = self.patch_embed(x)

        B, D, Hp, Wp = x.shape  # b * c * h * w

        cls_tokens = self.cls_token.expand(
            (B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
        x = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
        x = paddle.concat([cls_tokens, x], axis=1)

        if self.pos_embed is not None:
            x = x + self.interpolate_pos_encoding(x, h, w)

        x = self.pos_drop(x)

        rel_pos_bias = self.rel_pos_bias(
        ) if self.rel_pos_bias is not None else None

        feats = []
        for idx, blk in enumerate(self.blocks):
            if self.use_checkpoint and self.training:
                x = paddle.distributed.fleet.utils.recompute(
                    blk, x, rel_pos_bias, preserve_rng_state=True)
            else:
                x = blk(x, rel_pos_bias)

            if idx in self.out_indices:
                # Drop the cls token and fold the sequence back to a map.
                xp = paddle.reshape(
                    paddle.transpose(
                        self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
                    shape=[B, D, Hp, Wp])
                feats.append(xp)

        if self.with_fpn:
            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
            for i, feat in enumerate(feats):
                feats[i] = fpns[i](feat)

        return feats

    @property
    def num_layers(self):
        """int: number of transformer blocks."""
        return len(self.blocks)

    @property
    def no_weight_decay(self):
        """set[str]: parameter names excluded from weight decay."""
        return {'pos_embed', 'cls_token'}

    @property
    def out_shape(self):
        """list[ShapeSpec]: channels and stride of every output level."""
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self.out_channels, self.out_strides)
        ]
@ -0,0 +1,40 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
|
||||
def _get_class_default_kwargs(cls, *args, **kwargs): |
||||
""" |
||||
Get default arguments of a class in dict format, if args and |
||||
kwargs is specified, it will replace default arguments |
||||
""" |
||||
varnames = cls.__init__.__code__.co_varnames |
||||
argcount = cls.__init__.__code__.co_argcount |
||||
keys = varnames[:argcount] |
||||
assert keys[0] == 'self' |
||||
keys = keys[1:] |
||||
|
||||
values = list(cls.__init__.__defaults__) |
||||
assert len(values) == len(keys) |
||||
|
||||
if len(args) > 0: |
||||
for i, arg in enumerate(args): |
||||
values[i] = arg |
||||
|
||||
default_kwargs = dict(zip(keys, values)) |
||||
|
||||
if len(kwargs) > 0: |
||||
for k, v in kwargs.items(): |
||||
default_kwargs[k] = v |
||||
|
||||
return default_kwargs |
@ -0,0 +1,388 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import paddle |
||||
import paddle.nn as nn |
||||
import paddle.nn.functional as F |
||||
from paddlers.models.ppdet.core.workspace import register |
||||
|
||||
from ..bbox_utils import batch_distance2bbox |
||||
from ..losses import GIoULoss |
||||
from ..initializer import bias_init_with_prob, constant_, normal_ |
||||
from ..assigners.utils import generate_anchors_for_grid_cell |
||||
from paddlers.models.ppdet.modeling.backbones.cspresnet import ConvBNLayer |
||||
from paddlers.models.ppdet.modeling.ops import get_static_shape, get_act_fn |
||||
from paddlers.models.ppdet.modeling.layers import MultiClassNMS |
||||
|
||||
__all__ = ['PPYOLOEHead'] |
||||
|
||||
|
||||
class ESEAttn(nn.Layer):
    """Channel gating followed by a 1x1 ConvBNLayer.

    Args:
        feat_channels (int): input and output channel count.
        act: activation forwarded to ``ConvBNLayer``.
    """

    def __init__(self, feat_channels, act='swish'):
        super().__init__()
        # 1x1 conv that turns the pooled descriptor into per-channel logits.
        self.fc = nn.Conv2D(feat_channels, feat_channels, 1)
        self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act)

        self._init_weights()

    def _init_weights(self):
        """Initialise the gating conv weight with a small normal (std 1e-3)."""
        normal_(self.fc.weight, std=0.001)

    def forward(self, feat, avg_feat):
        """Scale ``feat`` by ``sigmoid(fc(avg_feat))`` and apply the conv."""
        gate = F.sigmoid(self.fc(avg_feat))
        gated_feat = feat * gate
        return self.conv(gated_feat)
||||
|
||||
|
||||
@register
class PPYOLOEHead(nn.Layer):
    """Detection head with per-level classification and distribution
    regression branches.

    Args:
        in_channels (list[int]): channel count of each input feature level.
        num_classes (int): number of foreground classes.
        act (str|dict|None|callable): activation for the stem layers; str,
            dict and None are resolved through ``get_act_fn``.
        fpn_strides (tuple[int]): stride of each input level; must have the
            same length as the features passed to ``forward``.
        grid_cell_scale (float): forwarded to
            ``generate_anchors_for_grid_cell`` during training.
        grid_cell_offset (float): offset added to anchor point coordinates.
        reg_max (int): each box side is predicted as a distribution over
            ``reg_max + 1`` bins.
        static_assigner_epoch (int): epochs below this value use
            ``static_assigner``; later epochs use ``assigner``.
        use_varifocal_loss (bool): varifocal loss if True, else focal loss.
        static_assigner, assigner, nms: injected components.
        eval_size (list[int]|None): if set, anchors for this fixed
            (height, width) are precomputed once.
        loss_weight (dict): weights under the keys 'class', 'iou', 'dfl'.
        trt (bool): forwarded to ``get_act_fn`` and to ``MultiClassNMS``.
        exclude_nms (bool): return raw boxes/scores without NMS.
        exclude_post_process (bool): skip rescaling and NMS.
    """
    __shared__ = [
        'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process'
    ]
    __inject__ = ['static_assigner', 'assigner', 'nms']

    def __init__(self,
                 in_channels=[1024, 512, 256],
                 num_classes=80,
                 act='swish',
                 fpn_strides=(32, 16, 8),
                 grid_cell_scale=5.0,
                 grid_cell_offset=0.5,
                 reg_max=16,
                 static_assigner_epoch=4,
                 use_varifocal_loss=True,
                 static_assigner='ATSSAssigner',
                 assigner='TaskAlignedAssigner',
                 nms='MultiClassNMS',
                 eval_size=None,
                 loss_weight={
                     'class': 1.0,
                     'iou': 2.5,
                     'dfl': 0.5,
                 },
                 trt=False,
                 exclude_nms=False,
                 exclude_post_process=False):
        super(PPYOLOEHead, self).__init__()
        assert len(in_channels) > 0, "len(in_channels) should > 0"
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.fpn_strides = fpn_strides
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.iou_loss = GIoULoss()
        # Copy so that editing one head's weights can never leak into the
        # shared default dict (and thereby into every other instance).
        self.loss_weight = dict(loss_weight)
        self.use_varifocal_loss = use_varifocal_loss
        self.eval_size = eval_size

        self.static_assigner_epoch = static_assigner_epoch
        self.static_assigner = static_assigner
        self.assigner = assigner
        self.nms = nms
        if isinstance(self.nms, MultiClassNMS) and trt:
            self.nms.trt = trt
        self.exclude_nms = exclude_nms
        self.exclude_post_process = exclude_post_process
        # stem
        self.stem_cls = nn.LayerList()
        self.stem_reg = nn.LayerList()
        act = get_act_fn(
            act, trt=trt) if act is None or isinstance(act,
                                                       (str, dict)) else act
        for in_c in self.in_channels:
            self.stem_cls.append(ESEAttn(in_c, act=act))
            self.stem_reg.append(ESEAttn(in_c, act=act))
        # pred head
        self.pred_cls = nn.LayerList()
        self.pred_reg = nn.LayerList()
        for in_c in self.in_channels:
            self.pred_cls.append(
                nn.Conv2D(
                    in_c, self.num_classes, 3, padding=1))
            self.pred_reg.append(
                nn.Conv2D(
                    in_c, 4 * (self.reg_max + 1), 3, padding=1))
        # projection conv: fixed 1x1 conv that turns a bin distribution
        # into its expected value (weights set in _init_weights)
        self.proj_conv = nn.Conv2D(self.reg_max + 1, 1, 1, bias_attr=False)
        self.proj_conv.skip_quant = True
        self._init_weights()

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` from the shapes of the incoming features."""
        return {'in_channels': [i.channels for i in input_shape], }

    def _init_weights(self):
        """Initialise prediction convs, the frozen projection conv and,
        when ``eval_size`` is set, the cached evaluation anchors."""
        bias_cls = bias_init_with_prob(0.01)
        for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
            constant_(cls_.weight)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.weight)
            constant_(reg_.bias, 1.0)

        # Projection weights are the bin indices 0..reg_max and stay frozen.
        proj = paddle.linspace(0, self.reg_max, self.reg_max + 1).reshape(
            [1, self.reg_max + 1, 1, 1])
        self.proj_conv.weight.set_value(proj)
        self.proj_conv.weight.stop_gradient = True
        if self.eval_size:
            anchor_points, stride_tensor = self._generate_anchors()
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    def forward_train(self, feats, targets):
        """Compute predictions for all levels and return the loss dict."""
        anchors, anchor_points, num_anchors_list, stride_tensor = \
            generate_anchors_for_grid_cell(
                feats, self.fpn_strides, self.grid_cell_scale,
                self.grid_cell_offset)

        cls_score_list, reg_distri_list = [], []
        for i, feat in enumerate(feats):
            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
                                         feat)
            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
            # cls and reg
            cls_score = F.sigmoid(cls_logit)
            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
            reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))
        cls_score_list = paddle.concat(cls_score_list, axis=1)
        reg_distri_list = paddle.concat(reg_distri_list, axis=1)

        return self.get_loss([
            cls_score_list, reg_distri_list, anchors, anchor_points,
            num_anchors_list, stride_tensor
        ], targets)

    def _generate_anchors(self, feats=None, dtype='float32'):
        """Build anchor points and their strides for every level.

        Grid sizes come from ``feats`` when given, otherwise from
        ``self.eval_size`` divided by each stride.

        Returns:
            tuple: (anchor_points [N, 2], stride_tensor [N, 1]).
        """
        # just use in eval time
        anchor_points = []
        stride_tensor = []
        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)
            shift_x = paddle.arange(end=w) + self.grid_cell_offset
            shift_y = paddle.arange(end=h) + self.grid_cell_offset
            shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
            anchor_point = paddle.cast(
                paddle.stack(
                    [shift_x, shift_y], axis=-1), dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype))
        anchor_points = paddle.concat(anchor_points)
        stride_tensor = paddle.concat(stride_tensor)
        return anchor_points, stride_tensor

    def forward_eval(self, feats):
        """Compute class scores and decoded side distances for inference.

        Returns:
            tuple: (cls_score_list, reg_dist_list, anchor_points,
            stride_tensor).
        """
        if self.eval_size:
            # Anchors were precomputed in _init_weights for the fixed size.
            anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points, stride_tensor = self._generate_anchors(feats)
        cls_score_list, reg_dist_list = [], []
        for i, feat in enumerate(feats):
            _, _, h, w = feat.shape
            l = h * w
            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
                                         feat)
            reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
            reg_dist = reg_dist.reshape([-1, 4, self.reg_max + 1, l]).transpose(
                [0, 2, 3, 1])
            # Softmax over the bins, then the frozen projection conv takes
            # the expectation of each distribution.
            reg_dist = self.proj_conv(F.softmax(reg_dist, axis=1)).squeeze(1)
            # cls and reg
            cls_score = F.sigmoid(cls_logit)
            cls_score_list.append(cls_score.reshape([-1, self.num_classes, l]))
            reg_dist_list.append(reg_dist)

        cls_score_list = paddle.concat(cls_score_list, axis=-1)
        reg_dist_list = paddle.concat(reg_dist_list, axis=1)

        return cls_score_list, reg_dist_list, anchor_points, stride_tensor

    def forward(self, feats, targets=None):
        """Dispatch to ``forward_train`` or ``forward_eval`` by mode."""
        assert len(feats) == len(self.fpn_strides), \
            "The size of feats is not equal to size of fpn_strides"

        if self.training:
            return self.forward_train(feats, targets)
        else:
            return self.forward_eval(feats)

    @staticmethod
    def _focal_loss(score, label, alpha=0.25, gamma=2.0):
        """Summed focal-weighted BCE; ``alpha <= 0`` disables alpha
        balancing."""
        weight = (score - label).pow(gamma)
        if alpha > 0:
            alpha_t = alpha * label + (1 - alpha) * (1 - label)
            weight *= alpha_t
        loss = F.binary_cross_entropy(
            score, label, weight=weight, reduction='sum')
        return loss

    @staticmethod
    def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
        """Summed BCE against ``gt_score``; positives are weighted by
        ``gt_score``, negatives by ``alpha * pred_score ** gamma``."""
        weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
        loss = F.binary_cross_entropy(
            pred_score, gt_score, weight=weight, reduction='sum')
        return loss

    def _bbox_decode(self, anchor_points, pred_dist):
        """Turn per-side bin logits into boxes around ``anchor_points``."""
        _, l, _ = get_static_shape(pred_dist)
        pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_max + 1]))
        pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1)
        return batch_distance2bbox(anchor_points, pred_dist)

    def _bbox2distance(self, points, bbox):
        """Convert x1y1x2y2 boxes to (left, top, right, bottom) distances
        from ``points``, clipped to [0, reg_max - 0.01]."""
        x1y1, x2y2 = paddle.split(bbox, 2, -1)
        lt = points - x1y1
        rb = x2y2 - points
        return paddle.concat([lt, rb], -1).clip(0, self.reg_max - 0.01)

    def _df_loss(self, pred_dist, target):
        """Cross entropy against the two integer bins surrounding each
        target distance, weighted by proximity to each bin."""
        target_left = paddle.cast(target, 'int64')
        target_right = target_left + 1
        weight_left = target_right.astype('float32') - target
        weight_right = 1 - weight_left
        loss_left = F.cross_entropy(
            pred_dist, target_left, reduction='none') * weight_left
        loss_right = F.cross_entropy(
            pred_dist, target_right, reduction='none') * weight_right
        return (loss_left + loss_right).mean(-1, keepdim=True)

    def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels,
                   assigned_bboxes, assigned_scores, assigned_scores_sum):
        """Compute L1, IoU and DFL losses over positive samples.

        Returns:
            tuple: (loss_l1, loss_iou, loss_dfl); all zero when there is no
            positive sample.
        """
        # select positive samples mask
        mask_positive = (assigned_labels != self.num_classes)
        num_pos = mask_positive.sum()
        # pos/neg loss
        if num_pos > 0:
            # l1 + iou
            bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])
            pred_bboxes_pos = paddle.masked_select(pred_bboxes,
                                                   bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos = paddle.masked_select(
                assigned_bboxes, bbox_mask).reshape([-1, 4])
            bbox_weight = paddle.masked_select(
                assigned_scores.sum(-1), mask_positive).unsqueeze(-1)

            loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)

            loss_iou = self.iou_loss(pred_bboxes_pos,
                                     assigned_bboxes_pos) * bbox_weight
            loss_iou = loss_iou.sum() / assigned_scores_sum

            dist_mask = mask_positive.unsqueeze(-1).tile(
                [1, 1, (self.reg_max + 1) * 4])
            pred_dist_pos = paddle.masked_select(
                pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1])
            assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes)
            assigned_ltrb_pos = paddle.masked_select(
                assigned_ltrb, bbox_mask).reshape([-1, 4])
            loss_dfl = self._df_loss(pred_dist_pos,
                                     assigned_ltrb_pos) * bbox_weight
            loss_dfl = loss_dfl.sum() / assigned_scores_sum
        else:
            loss_l1 = paddle.zeros([1])
            loss_iou = paddle.zeros([1])
            # Keeps the graph connected to pred_dist while contributing 0.
            loss_dfl = pred_dist.sum() * 0.
        return loss_l1, loss_iou, loss_dfl

    def get_loss(self, head_outs, gt_meta):
        """Assign targets and compute the training losses.

        Args:
            head_outs (list): the six items produced by ``forward_train``.
            gt_meta (dict): reads 'gt_class', 'gt_bbox', 'pad_gt_mask' and
                'epoch_id'.

        Returns:
            dict: 'loss', 'loss_cls', 'loss_iou', 'loss_dfl', 'loss_l1'.
        """
        pred_scores, pred_distri, anchors,\
        anchor_points, num_anchors_list, stride_tensor = head_outs

        # Work in units of the feature stride.
        anchor_points_s = anchor_points / stride_tensor
        pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)

        gt_labels = gt_meta['gt_class']
        gt_bboxes = gt_meta['gt_bbox']
        pad_gt_mask = gt_meta['pad_gt_mask']
        # label assignment
        if gt_meta['epoch_id'] < self.static_assigner_epoch:
            assigned_labels, assigned_bboxes, assigned_scores = \
                self.static_assigner(
                    anchors,
                    num_anchors_list,
                    gt_labels,
                    gt_bboxes,
                    pad_gt_mask,
                    bg_index=self.num_classes,
                    pred_bboxes=pred_bboxes.detach() * stride_tensor)
            alpha_l = 0.25
        else:
            assigned_labels, assigned_bboxes, assigned_scores = \
                self.assigner(
                    pred_scores.detach(),
                    pred_bboxes.detach() * stride_tensor,
                    anchor_points,
                    num_anchors_list,
                    gt_labels,
                    gt_bboxes,
                    pad_gt_mask,
                    bg_index=self.num_classes)
            alpha_l = -1
        # rescale bbox
        assigned_bboxes /= stride_tensor
        # cls loss
        if self.use_varifocal_loss:
            one_hot_label = F.one_hot(assigned_labels,
                                      self.num_classes + 1)[..., :-1]
            loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
                                            one_hot_label)
        else:
            loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)

        assigned_scores_sum = assigned_scores.sum()
        if paddle.distributed.get_world_size() > 1:
            # Average the normaliser across workers.
            paddle.distributed.all_reduce(assigned_scores_sum)
            assigned_scores_sum /= paddle.distributed.get_world_size()
        assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
        loss_cls /= assigned_scores_sum

        loss_l1, loss_iou, loss_dfl = \
            self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s,
                            assigned_labels, assigned_bboxes, assigned_scores,
                            assigned_scores_sum)
        loss = self.loss_weight['class'] * loss_cls + \
               self.loss_weight['iou'] * loss_iou + \
               self.loss_weight['dfl'] * loss_dfl
        out_dict = {
            'loss': loss,
            'loss_cls': loss_cls,
            'loss_iou': loss_iou,
            'loss_dfl': loss_dfl,
            'loss_l1': loss_l1,
        }
        return out_dict

    def post_process(self, head_outs, scale_factor):
        """Decode boxes and optionally rescale them and run NMS.

        Returns:
            tuple: ``(boxes_and_scores, None)`` when
            ``exclude_post_process`` is set; ``(pred_bboxes, pred_scores)``
            when ``exclude_nms`` is set; otherwise ``(bbox_pred, bbox_num)``
            from NMS.
        """
        pred_scores, pred_dist, anchor_points, stride_tensor = head_outs
        pred_bboxes = batch_distance2bbox(anchor_points, pred_dist)
        pred_bboxes *= stride_tensor
        if self.exclude_post_process:
            return paddle.concat(
                [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None
        else:
            # scale bbox to origin
            scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
            scale_factor = paddle.concat(
                [scale_x, scale_y, scale_x, scale_y],
                axis=-1).reshape([-1, 1, 4])
            pred_bboxes /= scale_factor
            if self.exclude_nms:
                # `exclude_nms=True` just use in benchmark
                return pred_bboxes, pred_scores
            else:
                bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
                return bbox_pred, bbox_num
@ -0,0 +1,249 @@ |
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from __future__ import absolute_import |
||||
from __future__ import division |
||||
from __future__ import print_function |
||||
|
||||
import math |
||||
import paddle |
||||
import paddle.nn as nn |
||||
import paddle.nn.functional as F |
||||
from paddle import ParamAttr |
||||
from paddle.nn.initializer import Normal, Constant |
||||
from paddlers.models.ppdet.modeling.bbox_utils import bbox2delta, delta2bbox |
||||
from paddlers.models.ppdet.modeling.heads.fcos_head import FCOSFeat |
||||
|
||||
from paddlers.models.ppdet.core.workspace import register |
||||
|
||||
__all__ = ['RetinaHead'] |
||||
|
||||
|
||||
@register
class RetinaFeat(FCOSFeat):
    """Conv layers for RetinaNet, built by reusing FCOSFeat unchanged.

    FCOSFeat is re-registered under the name RetinaFeat to avoid confusion.
    """
||||
|
||||
|
||||
@register |
||||
class RetinaHead(nn.Layer): |
||||
"""Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf |
||||
""" |
||||
__shared__ = ['num_classes'] |
||||
__inject__ = [ |
||||
'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', |
||||
'loss_bbox', 'nms' |
||||
] |
||||
|
||||
def __init__(self, |
||||
num_classes=80, |
||||
conv_feat='RetinaFeat', |
||||
anchor_generator='RetinaAnchorGenerator', |
||||
bbox_assigner='MaxIoUAssigner', |
||||
loss_class='FocalLoss', |
||||
loss_bbox='SmoothL1Loss', |
||||
nms='MultiClassNMS', |
||||
prior_prob=0.01, |
||||
nms_pre=1000, |
||||
weights=[1., 1., 1., 1.]): |
||||
super(RetinaHead, self).__init__() |
||||
self.num_classes = num_classes |
||||
self.conv_feat = conv_feat |
||||
self.anchor_generator = anchor_generator |
||||
self.bbox_assigner = bbox_assigner |
||||
self.loss_class = loss_class |
||||
self.loss_bbox = loss_bbox |
||||
self.nms = nms |
||||
self.nms_pre = nms_pre |
||||
self.weights = weights |
||||
|
||||
bias_init_value = -math.log((1 - prior_prob) / prior_prob) |
||||
num_anchors = self.anchor_generator.num_anchors |
||||
self.retina_cls = nn.Conv2D( |
||||
in_channels=self.conv_feat.feat_out, |
||||
out_channels=self.num_classes * num_anchors, |
||||
kernel_size=3, |
||||
stride=1, |
||||
padding=1, |
||||
weight_attr=ParamAttr(initializer=Normal( |
||||
mean=0.0, std=0.01)), |
||||
bias_attr=ParamAttr(initializer=Constant(value=bias_init_value))) |
||||
self.retina_reg = nn.Conv2D( |
||||
in_channels=self.conv_feat.feat_out, |
||||
out_channels=4 * num_anchors, |
||||
kernel_size=3, |
||||
stride=1, |
||||
padding=1, |
||||
weight_attr=ParamAttr(initializer=Normal( |
||||
mean=0.0, std=0.01)), |
||||
bias_attr=ParamAttr(initializer=Constant(value=0))) |
||||
|
||||
def forward(self, neck_feats, targets=None): |
||||
cls_logits_list = [] |
||||
bboxes_reg_list = [] |
||||
for neck_feat in neck_feats: |
||||
conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat) |
||||
cls_logits = self.retina_cls(conv_cls_feat) |
||||
bbox_reg = self.retina_reg(conv_reg_feat) |
||||
cls_logits_list.append(cls_logits) |
||||
bboxes_reg_list.append(bbox_reg) |
||||
|
||||
if self.training: |
||||
return self.get_loss([cls_logits_list, bboxes_reg_list], targets) |
||||
else: |
||||
return [cls_logits_list, bboxes_reg_list] |
||||
|
||||
def get_loss(self, head_outputs, targets):
    """Compute the classification and box-regression losses for a batch.

    Anchors are assigned to the ground-truth boxes of each image (no
    sampling is involved), the assigned positive and negative samples of
    all images are gathered, and the losses are evaluated on the gathered
    samples.

    Args:
        head_outputs: pair ``(cls_logits_list, bboxes_reg_list)`` of
            per-level head outputs.
        targets: mapping providing ``'gt_bbox'`` and ``'gt_class'``, each
            iterated image by image.

    Returns:
        dict with the keys ``'loss_cls'``, ``'loss_reg'`` and ``'loss'``,
        where ``'loss'`` is the sum of the other two.
    """
    cls_logits_list, bboxes_reg_list = head_outputs
    anchors = paddle.concat(self.anchor_generator(cls_logits_list))

    # Per-image assignment of anchors to ground truth.
    #   gt_inds: index of the matched gt for every anchor
    #   labels:  -1 (ignore), 0 (negative) or 1 (positive)
    assigned = []
    for boxes in targets['gt_bbox']:
        gt_inds, labels = self.bbox_assigner(anchors, boxes)
        assigned.append((gt_inds, labels))

    # Move channels last and flatten each level, then join the levels along
    # axis 1 so that the first axis indexes images.
    cls_logits = paddle.concat(
        [
            level.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes])
            for level in cls_logits_list
        ],
        axis=1)
    bboxes_reg = paddle.concat(
        [
            level.transpose([0, 2, 3, 1]).reshape([0, -1, 4])
            for level in bboxes_reg_list
        ],
        axis=1)

    cls_pred_list, cls_tar_list = [], []
    reg_pred_list, reg_tar_list = [], []
    per_image = zip(assigned, cls_logits, bboxes_reg, targets['gt_bbox'],
                    targets['gt_class'])
    for (gt_inds, labels), img_logits, img_deltas, gt_bbox, gt_class in per_image:
        is_pos = (labels == 1)
        is_neg = (labels == 0)
        is_used = paddle.logical_or(is_pos, is_neg)

        # Trick: append a background entry (value num_classes) to the class
        # vector and point every negative anchor at that last slot.
        gt_class = gt_class.reshape([-1])
        background = paddle.to_tensor(
            [self.num_classes], dtype=gt_class.dtype)
        gt_class = paddle.concat([gt_class, background], axis=-1)
        background_index = paddle.full_like(gt_inds, gt_class.size - 1)
        gt_inds = paddle.where(is_neg, background_index, gt_inds)

        cls_pred_list.append(img_logits[is_used])
        cls_tar_list.append(gt_class[gt_inds[is_used]])
        reg_pred_list.append(img_deltas[is_pos].reshape([-1, 4]))
        matched_boxes = gt_bbox[gt_inds[is_pos]].reshape([-1, 4])
        reg_tar_list.append(
            bbox2delta(anchors[is_pos], matched_boxes, self.weights))

    cls_pred = paddle.concat(cls_pred_list)
    cls_tar = paddle.concat(cls_tar_list)
    reg_pred = paddle.concat(reg_pred_list)
    reg_tar = paddle.concat(reg_tar_list)

    # Both losses are normalised by the number of positive samples,
    # clamped to at least 1.
    num_pos = reg_pred.shape[0]
    avg_factor = max(1.0, num_pos)
    cls_loss = self.loss_class(
        cls_pred, cls_tar, reduction='sum') / avg_factor

    if num_pos != 0:
        reg_loss = self.loss_bbox(
            reg_pred, reg_tar, reduction='sum') / avg_factor
    else:
        # No positives in the batch: use a zero regression loss that still
        # has stop_gradient disabled.
        reg_loss = paddle.zeros([1])
        reg_loss.stop_gradient = False

    return {
        'loss_cls': cls_loss,
        'loss_reg': reg_loss,
        'loss': cls_loss + reg_loss,
    }
||||
|
||||
def get_bboxes_single(self,
                      anchors,
                      cls_scores_list,
                      bbox_preds_list,
                      im_shape,
                      scale_factor,
                      rescale=True):
    """Decode the multi-level predictions of one image into boxes and scores.

    Args:
        anchors: per-level anchors, iterated in lockstep with the two
            prediction lists.
        cls_scores_list: per-level classification logits of the image.
        bbox_preds_list: per-level box deltas of the image; must have the
            same length as ``cls_scores_list``.
        im_shape: not used by this method.
        scale_factor: per-image scale factor; it is reversed and repeated
            twice to form the divisor applied to the boxes.
        rescale: when truthy, divide the decoded boxes by the scale factor.

    Returns:
        Tuple ``(mlvl_bboxes, mlvl_scores)``: the concatenated decoded boxes
        of all levels and the concatenated sigmoid scores transposed with
        ``[1, 0]``.
    """
    assert len(cls_scores_list) == len(bbox_preds_list)
    level_boxes, level_scores = [], []
    for level_anchors, scores, deltas in zip(anchors, cls_scores_list,
                                             bbox_preds_list):
        scores = scores.reshape([-1, self.num_classes])
        deltas = deltas.reshape([-1, 4])

        over_budget = (self.nms_pre is not None and
                       scores.shape[0] > self.nms_pre)
        if over_budget:
            # Keep only the nms_pre candidates whose best class score is
            # highest.
            best_score = scores.max(axis=1)
            _, keep = best_score.topk(self.nms_pre)
            deltas = deltas.gather(keep)
            level_anchors = level_anchors.gather(keep)
            scores = scores.gather(keep)

        # NOTE(review): squeeze() without an axis drops every size-1 axis, so
        # a level holding a single candidate may lose its leading axis as
        # well -- confirm against the output shape of delta2bbox.
        decoded = delta2bbox(deltas, level_anchors, self.weights).squeeze()
        level_boxes.append(decoded)
        level_scores.append(F.sigmoid(scores))

    boxes = paddle.squeeze(paddle.concat(level_boxes))
    if rescale:
        flipped_scale = scale_factor[::-1]
        boxes = boxes / paddle.concat([flipped_scale, flipped_scale])

    scores = paddle.concat(level_scores).transpose([1, 0])
    return boxes, scores
||||
|
||||
def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor):
    """Decode the predictions of every image in the batch.

    Args:
        anchors: per-level anchors, passed unchanged to
            ``get_bboxes_single``.
        cls_logits: per-level classification outputs, indexed by image along
            their first axis.
        bboxes_reg: per-level regression outputs, indexed the same way.
        im_shape: per-image shapes, indexed by image id.
        scale_factor: per-image scale factors, indexed by image id.

    Returns:
        Tuple ``(batch_bboxes, batch_scores)`` obtained by stacking the
        per-image results of ``get_bboxes_single`` along axis 0.
    """
    batch_size = cls_logits[0].shape[0]
    level_ids = range(len(cls_logits))

    boxes_per_image, scores_per_image = [], []
    for img_id in range(batch_size):
        # Slice this image out of every level before decoding it.
        img_boxes, img_scores = self.get_bboxes_single(
            anchors,
            [cls_logits[lvl][img_id] for lvl in level_ids],
            [bboxes_reg[lvl][img_id] for lvl in level_ids],
            im_shape[img_id],
            scale_factor[img_id])
        boxes_per_image.append(img_boxes)
        scores_per_image.append(img_scores)

    batch_bboxes = paddle.stack(boxes_per_image, axis=0)
    batch_scores = paddle.stack(scores_per_image, axis=0)
    return batch_bboxes, batch_scores
||||
|
||||
def post_process(self, head_outputs, im_shape, scale_factor):
    """Turn raw head outputs into NMS-filtered detections.

    Args:
        head_outputs: pair ``(cls_logits_list, bboxes_reg_list)`` of
            per-level head outputs.
        im_shape: per-image shapes, forwarded to ``decode``.
        scale_factor: per-image scale factors, forwarded to ``decode``.

    Returns:
        Tuple ``(bbox_pred, bbox_num)``: the first two values returned by
        ``self.nms``; its third return value is discarded.
    """

    def _channels_last(levels):
        # Permute each level with [0, 2, 3, 1], i.e. move axis 1 to the end.
        return [level.transpose([0, 2, 3, 1]) for level in levels]

    raw_cls, raw_reg = head_outputs
    anchors = self.anchor_generator(raw_cls)
    decoded = self.decode(anchors,
                          _channels_last(raw_cls),
                          _channels_last(raw_reg), im_shape, scale_factor)
    bboxes, scores = decoded

    bbox_pred, bbox_num, _ = self.nms(bboxes, scores)
    return bbox_pred, bbox_num
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue