Refactor code for detection and segmentation

pull/28/head
triple-Mu 2 years ago
parent 0770f5fa60
commit 13ed0033db
  1. .gitignore (10 changes)
  2. README.md (14 changes)
  3. build.py (3 changes)
  4. config.py (40 changes, new file)
  5. docs/Segment.md (35 changes)
  6. export-det.py (0 changes, renamed from export.py)
  7. export-seg.py (0 changes, renamed from export_seg.py)
  8. infer-det-without-torch.py (91 changes, new file)
  9. infer-det.py (82 changes, new file)
  10. infer-no-torch.py (254 changes, deleted)
  11. infer-seg-without-torch.py (100 changes, new file)
  12. infer-seg.py (101 changes, new file)
  13. infer.py (267 changes, deleted)
  14. models/engine.py (3 changes)
  15. models/torch_utils.py (54 changes)
  16. models/utils.py (125 changes)
  17. profile.py (29 changes)

.gitignore (vendored)

@@ -1,3 +1,13 @@
+# Personal
+.idea
+*.engine
+*.pt
+*.pth
+*.onnx
+*.jpg
+*.png
+*.bmp
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

README.md

@@ -54,7 +54,7 @@ You can export your ONNX model with the `ultralytics` API
and add the postprocess into the model at the same time.
``` shell
-python3 export.py \
+python3 export-det.py \
--weights yolov8s.pt \
--iou-thres 0.65 \
--conf-thres 0.25 \
@@ -148,12 +148,12 @@ Please see more information in [`API-Build.md`](docs/API-Build.md)
## 1. Infer with Python script
-You can infer images with the engine by [`infer.py`](infer.py) .
+You can infer images with the engine using [`infer-det.py`](infer-det.py).
Usage:
``` shell
-python3 infer.py \
+python3 infer-det.py \
--engine yolov8s.engine \
--imgs data \
--show \
@@ -215,13 +215,13 @@ If you want to profile the TensorRT engine:
Usage:
``` shell
-python3 infer.py --engine yolov8s.engine --profile
+python3 profile.py --engine yolov8s.engine --device cuda:0
```
-# Refuse To Use PyTorch for model inference !!!
+# Refuse To Use PyTorch for Model Inference !!!
If you need to break away from PyTorch and use TensorRT inference,
-you can get more information in [`infer-no-torch.py`](infer-no-torch.py),
+you can get more information in [`infer-det-without-torch.py`](infer-det-without-torch.py);
the usage is the same as the PyTorch version, but its performance is much worse.
You can use `cuda-python` or `pycuda` for inference.
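For reference, the without-torch scripts choose the CUDA binding at runtime. Below is a minimal sketch of that switch, mirroring the code in `infer-det-without-torch.py` later in this commit and assuming the `models.cudart_api` / `models.pycuda_api` wrappers from this repo; `load_engine` itself is a hypothetical helper, not part of the commit:
``` python
# Sketch of the --method switch; both wrappers expose the same
# TRTEngine interface, so the rest of the pipeline is unchanged.
def load_engine(engine_path: str, method: str = 'cudart'):
    if method == 'cudart':
        from models.cudart_api import TRTEngine  # cuda-python backend
    elif method == 'pycuda':
        from models.pycuda_api import TRTEngine  # pycuda backend
    else:
        raise NotImplementedError(method)
    return TRTEngine(engine_path)
```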
@@ -236,7 +236,7 @@ pip install pycuda
Usage:
``` shell
-python3 infer-no-torch.py \
+python3 infer-det-without-torch.py \
--engine yolov8s.engine \
--imgs data \
--show \

build.py

@@ -1,10 +1,7 @@
import argparse
-import os
from models import EngineBuilder
-os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
def parse_args():
parser = argparse.ArgumentParser()
config.py (new file)

@@ -0,0 +1,40 @@
import random
import numpy as np
random.seed(0)
# detection model classes
CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
'scissors', 'teddy bear', 'hair drier', 'toothbrush')
# colors per class
COLORS = {
cls: [random.randint(0, 255) for _ in range(3)]
for cls in CLASSES
}
# colors for segment masks
MASK_COLORS = np.array([(255, 56, 56), (255, 157, 151), (255, 112, 31),
(255, 178, 29), (207, 210, 49), (72, 249, 10),
(146, 204, 23), (61, 219, 134), (26, 147, 52),
(0, 212, 187), (44, 153, 168), (0, 194, 255),
(52, 69, 147), (100, 115, 255), (0, 24, 236),
(132, 56, 255), (82, 0, 133), (203, 56, 255),
(255, 149, 200), (255, 55, 199)],
dtype=np.float32) / 255.
# alpha for segment masks
ALPHA = 0.5
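A quick illustration (not part of the commit) of how these tables are indexed; the values shown are examples only:
``` python
from config import ALPHA, CLASSES, COLORS, MASK_COLORS

cls = CLASSES[0]           # 'person'
box_color = COLORS[cls]    # seed-stable random int triple for cv2 drawing
mask_rgb = MASK_COLORS[0]  # float RGB in [0, 1]; ALPHA scales the overlay
```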

docs/Segment.md

@@ -10,7 +10,7 @@ The yolov8-seg model conversion route is:
You can export your ONNX model with the `ultralytics` API; the ONNX is also modified by this repo.
``` shell
-python3 export_seg.py \
+python3 export-seg.py \
--weights yolov8s-seg.pt \
--opset 11 \
--sim \
@@ -68,18 +68,17 @@ Usage:
## Infer with Python script
-You can infer images with the engine by [`infer.py`](../infer.py) .
+You can infer images with the engine using [`infer-seg.py`](../infer-seg.py).
Usage:
``` shell
-python3 infer.py \
+python3 infer-seg.py \
--engine yolov8s-seg.engine \
--imgs data \
--show \
--out-dir outputs \
---device cuda:0 \
---seg
+--device cuda:0
```
#### Description of all arguments
@@ -90,7 +89,6 @@ python3 infer.py \
- `--out-dir` : Where to save detection results images. It will not work when the `--show` flag is used.
- `--device` : The CUDA device you use.
- `--profile` : Profile the TensorRT engine.
-- `--seg` : Infer with seg model.
## Infer with C++
@@ -207,3 +205,28 @@ Usage:
# infer video
./yolov8-seg yolov8s-seg.engine data/test.mp4 # the video path
```
+# Refuse To Use PyTorch for Segment Model Inference !!!
+It is the same as the detection model.
+You can get more information in [`infer-seg-without-torch.py`](../infer-seg-without-torch.py).
+Usage:
+``` shell
+python3 infer-seg-without-torch.py \
+--engine yolov8s-seg.engine \
+--imgs data \
+--show \
+--out-dir outputs \
+--method cudart
+```
+#### Description of all arguments
+- `--engine` : The Engine you export.
+- `--imgs` : The images path you want to detect.
+- `--show` : Whether to show detection results.
+- `--out-dir` : Where to save detection results images. It will not work when the `--show` flag is used.
+- `--method` : Choose `cudart` or `pycuda`; default is `cudart`.
+- `--profile` : Profile the TensorRT engine.

infer-det-without-torch.py (new file)

@@ -0,0 +1,91 @@
import argparse
from pathlib import Path
import cv2
import numpy as np
from config import CLASSES, COLORS
from models.utils import blob, det_postprocess, letterbox, path_to_list
def main(args: argparse.Namespace) -> None:
if args.method == 'cudart':
from models.cudart_api import TRTEngine
elif args.method == 'pycuda':
from models.pycuda_api import TRTEngine
else:
raise NotImplementedError
Engine = TRTEngine(args.engine)
H, W = Engine.inp_info[0].shape[-2:]
images = path_to_list(args.imgs)
save_path = Path(args.out_dir)
if not args.show and not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
for image in images:
save_image = save_path / image.name
bgr = cv2.imread(str(image))
draw = bgr.copy()
bgr, ratio, dwdh = letterbox(bgr, (W, H))
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
tensor = blob(rgb, return_seg=False)
dwdh = np.array(dwdh * 2, dtype=np.float32)
tensor = np.ascontiguousarray(tensor)
# inference
data = Engine(tensor)
bboxes, scores, labels = det_postprocess(data)
bboxes -= dwdh
bboxes /= ratio
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.round().astype(np.int32).tolist()
cls_id = int(label)
cls = CLASSES[cls_id]
color = COLORS[cls]
cv2.rectangle(draw, bbox[:2], bbox[2:], color, 2)
cv2.putText(draw,
f'{cls}:{score:.3f}', (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.75, [225, 255, 255],
thickness=2)
if args.show:
cv2.imshow('result', draw)
cv2.waitKey(0)
else:
cv2.imwrite(str(save_image), draw)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--engine', type=str, help='Engine file')
parser.add_argument('--imgs', type=str, help='Images file')
parser.add_argument('--show',
action='store_true',
help='Show the detection results')
parser.add_argument('--out-dir',
type=str,
default='./output',
help='Path to output file')
parser.add_argument('--conf-thres',
type=float,
default=0.25,
help='Confidence threshold')
parser.add_argument('--iou-thres',
type=float,
default=0.65,
help='IoU threshold')
parser.add_argument('--method',
type=str,
default='cudart',
help='CUDA runtime API to use: cudart or pycuda')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
main(args)
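The `bboxes -= dwdh` / `bboxes /= ratio` step above undoes the letterbox transform: subtract the padding offsets, then divide by the resize ratio to map boxes back onto the original image. A quick numeric check with illustrative values:
``` python
import numpy as np

# Illustrative: a 1280x720 image letterboxed to 640x640 gives
# ratio = 0.5 and per-side padding (dw, dh) = (0, 140).
ratio = 0.5
dwdh = np.array((0.0, 140.0) * 2, dtype=np.float32)  # (dw, dh, dw, dh)
bbox = np.array([100.0, 240.0, 420.0, 500.0])        # on the 640x640 input
print((bbox - dwdh) / ratio)                         # [200. 200. 840. 720.]
```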

infer-det.py (new file)

@@ -0,0 +1,82 @@
from models import TRTModule # isort:skip
import argparse
from pathlib import Path
import cv2
import torch
from config import CLASSES, COLORS
from models.torch_utils import det_postprocess
from models.utils import blob, letterbox, path_to_list
def main(args: argparse.Namespace) -> None:
device = torch.device(args.device)
Engine = TRTModule(args.engine, device)
H, W = Engine.inp_info[0].shape[-2:]
# set desired output names order
Engine.set_desired(['num_dets', 'bboxes', 'scores', 'labels'])
images = path_to_list(args.imgs)
save_path = Path(args.out_dir)
if not args.show and not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
for image in images:
save_image = save_path / image.name
bgr = cv2.imread(str(image))
draw = bgr.copy()
bgr, ratio, dwdh = letterbox(bgr, (W, H))
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
tensor = blob(rgb, return_seg=False)
dwdh = torch.asarray(dwdh * 2, dtype=torch.float32, device=device)
tensor = torch.asarray(tensor, device=device)
# inference
data = Engine(tensor)
bboxes, scores, labels = det_postprocess(data)
bboxes -= dwdh
bboxes /= ratio
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.round().int().tolist()
cls_id = int(label)
cls = CLASSES[cls_id]
color = COLORS[cls]
cv2.rectangle(draw, bbox[:2], bbox[2:], color, 2)
cv2.putText(draw,
f'{cls}:{score:.3f}', (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.75, [225, 255, 255],
thickness=2)
if args.show:
cv2.imshow('result', draw)
cv2.waitKey(0)
else:
cv2.imwrite(str(save_image), draw)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument('--engine', type=str, help='Engine file')
parser.add_argument('--imgs', type=str, help='Images file')
parser.add_argument('--show',
action='store_true',
help='Show the detection results')
parser.add_argument('--out-dir',
type=str,
default='./output',
help='Path to output file')
parser.add_argument('--device',
type=str,
default='cuda:0',
help='TensorRT infer device')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
main(args)
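One detail worth noting in this torch variant: `Engine.set_desired` pins the output binding order so `det_postprocess` can unpack the tensors positionally. The shapes below are illustrative for a batch-1 engine whose built-in NMS keeps at most 100 boxes; actual limits depend on your export flags:
``` python
# data arrives in the order requested via set_desired(...)
num_dets, bboxes, scores, labels = data
# e.g. shapes (1, 1), (1, 100, 4), (1, 100), (1, 100)
```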

infer-no-torch.py (deleted)

@@ -1,254 +0,0 @@
import argparse
import os
import random
from pathlib import Path
from typing import List, Tuple, Union
import cv2
import numpy as np
from numpy import ndarray
os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
random.seed(0)
SUFFIXS = ('.bmp', '.dng', '.jpeg', '.jpg', '.mpo', '.png', '.tif', '.tiff',
'.webp', '.pfm')
CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
'scissors', 'teddy bear', 'hair drier', 'toothbrush')
COLORS = {
cls: [random.randint(0, 255) for _ in range(3)]
for i, cls in enumerate(CLASSES)
}
# the same as yolov8
MASK_COLORS = np.array([(255, 56, 56), (255, 157, 151), (255, 112, 31),
(255, 178, 29), (207, 210, 49), (72, 249, 10),
(146, 204, 23), (61, 219, 134), (26, 147, 52),
(0, 212, 187), (44, 153, 168), (0, 194, 255),
(52, 69, 147), (100, 115, 255), (0, 24, 236),
(132, 56, 255), (82, 0, 133), (203, 56, 255),
(255, 149, 200), (255, 55, 199)],
dtype=np.float32) / 255.
ALPHA = 0.5
def letterbox(
im: ndarray,
new_shape: Union[Tuple, List] = (640, 640),
color: Union[Tuple, List] = (114, 114, 114)
) -> Tuple[ndarray, float, Tuple[float, float]]:
# Resize and pad image while meeting stride-multiple constraints
shape = im.shape[:2] # current shape [height, width]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[
1] # wh padding
dw /= 2 # divide padding into 2 sides
dh /= 2
if shape[::-1] != new_unpad: # resize
im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
im = cv2.copyMakeBorder(im,
top,
bottom,
left,
right,
cv2.BORDER_CONSTANT,
value=color) # add border
return im, r, (dw, dh)
def blob(im: ndarray) -> Tuple[ndarray, ndarray]:
seg = im.astype(np.float32) / 255
im = im.transpose([2, 0, 1])
im = im[np.newaxis, ...]
im = np.ascontiguousarray(im).astype(np.float32) / 255
return im, seg
def main(args):
if args.method == 'cudart':
from models.cudart_api import TRTEngine
elif args.method == 'pycuda':
from models.pycuda_api import TRTEngine
else:
raise NotImplementedError
Engine = TRTEngine(args.engine)
H, W = Engine.inp_info[0].shape[-2:]
images_path = Path(args.imgs)
assert images_path.exists()
save_path = Path(args.out_dir)
if images_path.is_dir():
images = [
i.absolute() for i in images_path.iterdir() if i.suffix in SUFFIXS
]
else:
assert images_path.suffix in SUFFIXS
images = [images_path.absolute()]
if not args.show and not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
for image in images:
save_image = save_path / image.name
bgr = cv2.imread(str(image))
draw = bgr.copy()
bgr, ratio, dwdh = letterbox(bgr, (W, H))
dw, dh = int(dwdh[0]), int(dwdh[1])
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
tensor, seg_img = blob(rgb)
dwdh = np.array(dwdh * 2, dtype=np.float32)
tensor = np.ascontiguousarray(tensor)
data = Engine(tensor)
if args.seg:
seg_img = seg_img[dh:H - dh, dw:W - dw, [2, 1, 0]]
bboxes, scores, labels, masks = seg_postprocess(
data, bgr.shape[:2], args.conf_thres, args.iou_thres)
mask, mask_color = [m[:, dh:H - dh, dw:W - dw, :] for m in masks]
inv_alph_masks = (1 - mask * 0.5).cumprod(0)
mcs = (mask_color * inv_alph_masks).sum(0) * 2
seg_img = (seg_img * inv_alph_masks[-1] + mcs) * 255
draw = cv2.resize(seg_img.astype(np.uint8), draw.shape[:2][::-1])
else:
bboxes, scores, labels = det_postprocess(data)
bboxes -= dwdh
bboxes /= ratio
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.round().astype(np.int32).tolist()
cls_id = int(label)
cls = CLASSES[cls_id]
color = COLORS[cls]
cv2.rectangle(draw, bbox[:2], bbox[2:], color, 2)
cv2.putText(draw,
f'{cls}:{score:.3f}', (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.75, [225, 255, 255],
thickness=2)
if args.show:
cv2.imshow('result', draw)
cv2.waitKey(0)
else:
cv2.imwrite(str(save_image), draw)
def crop_mask(masks: ndarray, bboxes: ndarray) -> ndarray:
n, h, w = masks.shape
x1, y1, x2, y2 = np.split(bboxes[:, :, None], [1, 2, 3],
1) # x1 shape(1,1,n)
r = np.arange(w, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1)
c = np.arange(h, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1)
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
def seg_postprocess(
data: Tuple[ndarray],
shape: Union[Tuple, List],
conf_thres: float = 0.25,
iou_thres: float = 0.65) -> Tuple[ndarray, ndarray, ndarray, List]:
assert len(data) == 2
h, w = shape[0] // 4, shape[1] // 4 # 4x downsampling
outputs, proto = (i[0] for i in data)
bboxes, scores, labels, maskconf = np.split(outputs, [4, 5, 6], 1)
scores, labels = scores.squeeze(), labels.squeeze()
select = scores > conf_thres
bboxes, scores, labels, maskconf = bboxes[select], scores[select], labels[
select], maskconf[select]
cvbboxes = np.concatenate([bboxes[:, :2], bboxes[:, 2:] - bboxes[:, :2]],
1)
labels = labels.astype(np.int32)
v0, v1 = map(int, (cv2.__version__).split('.')[:2])
assert v0 == 4, 'OpenCV version is wrong'
if v1 > 6:
idx = cv2.dnn.NMSBoxesBatched(cvbboxes, scores, labels, conf_thres,
iou_thres)
else:
idx = cv2.dnn.NMSBoxes(cvbboxes, scores, conf_thres, iou_thres)
bboxes, scores, labels, maskconf = bboxes[idx], scores[idx], labels[
idx], maskconf[idx]
masks = (maskconf @ proto).reshape(-1, h, w)
masks = crop_mask(masks, bboxes / 4.)
masks = cv2.resize(masks.transpose([1, 2, 0]),
shape,
interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
masks = np.ascontiguousarray((masks > 0.5)[..., None])
cidx = labels % len(MASK_COLORS)
mask_color = MASK_COLORS[cidx].reshape(-1, 1, 1, 3) * ALPHA
out = [masks, masks @ mask_color]
return bboxes, scores, labels, out
def det_postprocess(data: Tuple[ndarray, ndarray, ndarray]):
assert len(data) == 4
num_dets, bboxes, scores, labels = (i[0] for i in data)
nums = num_dets.item()
bboxes = bboxes[:nums]
scores = scores[:nums]
labels = labels[:nums]
return bboxes, scores, labels
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--engine', type=str, help='Engine file')
parser.add_argument('--imgs', type=str, help='Images file')
parser.add_argument('--show',
action='store_true',
help='Show the detection results')
parser.add_argument('--seg', action='store_true', help='Seg inference')
parser.add_argument('--out-dir',
type=str,
default='./output',
help='Path to output file')
parser.add_argument('--conf-thres',
type=float,
default=0.25,
help='Confidence threshold')
parser.add_argument('--iou-thres',
type=float,
default=0.65,
help='Confidence threshold')
parser.add_argument('--method',
type=str,
default='cudart',
help='CUDART pipeline')
parser.add_argument('--profile',
action='store_true',
help='Profile TensorRT engine')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
main(args)

infer-seg-without-torch.py (new file)

@@ -0,0 +1,100 @@
import argparse
from pathlib import Path
import cv2
import numpy as np
from config import CLASSES, COLORS
from models.utils import blob, letterbox, path_to_list, seg_postprocess
def main(args: argparse.Namespace) -> None:
if args.method == 'cudart':
from models.cudart_api import TRTEngine
elif args.method == 'pycuda':
from models.pycuda_api import TRTEngine
else:
raise NotImplementedError
Engine = TRTEngine(args.engine)
H, W = Engine.inp_info[0].shape[-2:]
images = path_to_list(args.imgs)
save_path = Path(args.out_dir)
if not args.show and not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
for image in images:
save_image = save_path / image.name
bgr = cv2.imread(str(image))
draw = bgr.copy()
bgr, ratio, dwdh = letterbox(bgr, (W, H))
dw, dh = int(dwdh[0]), int(dwdh[1])
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
tensor, seg_img = blob(rgb, return_seg=True)
dwdh = np.array(dwdh * 2, dtype=np.float32)
tensor = np.ascontiguousarray(tensor)
# inference
data = Engine(tensor)
seg_img = seg_img[dh:H - dh, dw:W - dw, [2, 1, 0]]
bboxes, scores, labels, masks = seg_postprocess(
data, bgr.shape[:2], args.conf_thres, args.iou_thres)
mask, mask_color = [m[:, dh:H - dh, dw:W - dw, :] for m in masks]
inv_alph_masks = (1 - mask * 0.5).cumprod(0)
mcs = (mask_color * inv_alph_masks).sum(0) * 2
seg_img = (seg_img * inv_alph_masks[-1] + mcs) * 255
draw = cv2.resize(seg_img.astype(np.uint8), draw.shape[:2][::-1])
bboxes -= dwdh
bboxes /= ratio
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.round().astype(np.int32).tolist()
cls_id = int(label)
cls = CLASSES[cls_id]
color = COLORS[cls]
cv2.rectangle(draw, bbox[:2], bbox[2:], color, 2)
cv2.putText(draw,
f'{cls}:{score:.3f}', (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.75, [225, 255, 255],
thickness=2)
if args.show:
cv2.imshow('result', draw)
cv2.waitKey(0)
else:
cv2.imwrite(str(save_image), draw)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--engine', type=str, help='Engine file')
parser.add_argument('--imgs', type=str, help='Images file')
parser.add_argument('--show',
action='store_true',
help='Show the detection results')
parser.add_argument('--out-dir',
type=str,
default='./output',
help='Path to output file')
parser.add_argument('--conf-thres',
type=float,
default=0.25,
help='Confidence threshold')
parser.add_argument('--iou-thres',
type=float,
default=0.65,
help='IoU threshold')
parser.add_argument('--method',
type=str,
default='cudart',
help='CUDA runtime API to use: cudart or pycuda')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
main(args)
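The mask overlay in the loop above is cumulative alpha compositing: every instance dims whatever lies behind it by `(1 - ALPHA * mask)`, and the running product tracks how much of the image and of earlier mask colors still shows through. A standalone sketch of the same arithmetic, with hypothetical shapes:
``` python
import numpy as np

ALPHA = 0.5
n, h, w = 3, 160, 160                                   # hypothetical sizes
mask = (np.random.rand(n, h, w, 1) > 0.5).astype(np.float32)
mask_color = mask * np.random.rand(n, 1, 1, 3) * ALPHA  # colored masks

inv_alph = (1 - mask * ALPHA).cumprod(0)  # transmittance after each layer
mcs = (mask_color * inv_alph).sum(0) * 2  # accumulated mask colors
seg_img = np.random.rand(h, w, 3)         # normalized crop in [0, 1]
blended = seg_img * inv_alph[-1] + mcs    # dimmed image plus mask colors
```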

infer-seg.py (new file)

@@ -0,0 +1,101 @@
from models import TRTModule # isort:skip
import argparse
from pathlib import Path
import cv2
import numpy as np
import torch
from config import CLASSES, COLORS
from models.torch_utils import seg_postprocess
from models.utils import blob, letterbox, path_to_list
def main(args: argparse.Namespace) -> None:
device = torch.device(args.device)
Engine = TRTModule(args.engine, device)
H, W = Engine.inp_info[0].shape[-2:]
# set desired output names order
Engine.set_desired(['outputs', 'proto'])
images = path_to_list(args.imgs)
save_path = Path(args.out_dir)
if not args.show and not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
for image in images:
save_image = save_path / image.name
bgr = cv2.imread(str(image))
draw = bgr.copy()
bgr, ratio, dwdh = letterbox(bgr, (W, H))
dw, dh = int(dwdh[0]), int(dwdh[1])
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
tensor, seg_img = blob(rgb, return_seg=True)
dwdh = torch.asarray(dwdh * 2, dtype=torch.float32, device=device)
tensor = torch.asarray(tensor, device=device)
# inference
data = Engine(tensor)
seg_img = torch.asarray(seg_img[dh:H - dh, dw:W - dw, [2, 1, 0]],
device=device)
bboxes, scores, labels, masks = seg_postprocess(
data, bgr.shape[:2], args.conf_thres, args.iou_thres)
mask, mask_color = [m[:, dh:H - dh, dw:W - dw, :] for m in masks]
inv_alph_masks = (1 - mask * 0.5).cumprod(0)
mcs = (mask_color * inv_alph_masks).sum(0) * 2
seg_img = (seg_img * inv_alph_masks[-1] + mcs) * 255
draw = cv2.resize(seg_img.cpu().numpy().astype(np.uint8),
draw.shape[:2][::-1])
bboxes -= dwdh
bboxes /= ratio
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.round().int().tolist()
cls_id = int(label)
cls = CLASSES[cls_id]
color = COLORS[cls]
cv2.rectangle(draw, bbox[:2], bbox[2:], color, 2)
cv2.putText(draw,
f'{cls}:{score:.3f}', (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.75, [225, 255, 255],
thickness=2)
if args.show:
cv2.imshow('result', draw)
cv2.waitKey(0)
else:
cv2.imwrite(str(save_image), draw)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument('--engine', type=str, help='Engine file')
parser.add_argument('--imgs', type=str, help='Images file')
parser.add_argument('--show',
action='store_true',
help='Show the detection results')
parser.add_argument('--out-dir',
type=str,
default='./output',
help='Path to output file')
parser.add_argument('--conf-thres',
type=float,
default=0.25,
help='Confidence threshold')
parser.add_argument('--iou-thres',
type=float,
default=0.65,
help='IoU threshold')
parser.add_argument('--device',
type=str,
default='cuda:0',
help='TensorRT infer device')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
main(args)

infer.py (deleted)

@@ -1,267 +0,0 @@
from models import TRTModule, TRTProfilerV0 # isort:skip
import argparse
import os
import random
from pathlib import Path
from typing import List, Tuple, Union
import cv2
import numpy as np
import torch
import torch.nn.functional as F
from numpy import ndarray
from torch import Tensor
from torchvision.ops import batched_nms
os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
random.seed(0)
SUFFIXS = ('.bmp', '.dng', '.jpeg', '.jpg', '.mpo', '.png', '.tif', '.tiff',
'.webp', '.pfm')
CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
'scissors', 'teddy bear', 'hair drier', 'toothbrush')
COLORS = {
cls: [random.randint(0, 255) for _ in range(3)]
for i, cls in enumerate(CLASSES)
}
# the same as yolov8
MASK_COLORS = np.array([(255, 56, 56), (255, 157, 151), (255, 112, 31),
(255, 178, 29), (207, 210, 49), (72, 249, 10),
(146, 204, 23), (61, 219, 134), (26, 147, 52),
(0, 212, 187), (44, 153, 168), (0, 194, 255),
(52, 69, 147), (100, 115, 255), (0, 24, 236),
(132, 56, 255), (82, 0, 133), (203, 56, 255),
(255, 149, 200), (255, 55, 199)],
dtype=np.float32) / 255.
ALPHA = 0.5
def letterbox(
im: ndarray,
new_shape: Union[Tuple, List] = (640, 640),
color: Union[Tuple, List] = (114, 114, 114)
) -> Tuple[ndarray, float, Tuple[float, float]]:
# Resize and pad image while meeting stride-multiple constraints
shape = im.shape[:2] # current shape [height, width]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[
1] # wh padding
dw /= 2 # divide padding into 2 sides
dh /= 2
if shape[::-1] != new_unpad: # resize
im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
im = cv2.copyMakeBorder(im,
top,
bottom,
left,
right,
cv2.BORDER_CONSTANT,
value=color) # add border
return im, r, (dw, dh)
def blob(im: ndarray) -> Tuple[ndarray, ndarray]:
seg = im.astype(np.float32) / 255
im = im.transpose([2, 0, 1])
im = im[np.newaxis, ...]
im = np.ascontiguousarray(im).astype(np.float32) / 255
return im, seg
def main(args):
device = torch.device(args.device)
Engine = TRTModule(args.engine, device)
H, W = Engine.inp_info[0].shape[-2:]
# set desired output names order
if args.seg:
Engine.set_desired(['outputs', 'proto'])
else:
Engine.set_desired(['num_dets', 'bboxes', 'scores', 'labels'])
images_path = Path(args.imgs)
assert images_path.exists()
save_path = Path(args.out_dir)
if images_path.is_dir():
images = [
i.absolute() for i in images_path.iterdir() if i.suffix in SUFFIXS
]
else:
assert images_path.suffix in SUFFIXS
images = [images_path.absolute()]
if not args.show and not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
for image in images:
save_image = save_path / image.name
bgr = cv2.imread(str(image))
draw = bgr.copy()
bgr, ratio, dwdh = letterbox(bgr, (W, H))
dw, dh = int(dwdh[0]), int(dwdh[1])
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
tensor, seg_img = blob(rgb)
dwdh = torch.asarray(dwdh * 2, dtype=torch.float32, device=device)
tensor = torch.asarray(tensor, device=device)
data = Engine(tensor)
if args.seg:
seg_img = torch.asarray(seg_img[dh:H - dh, dw:W - dw, [2, 1, 0]],
device=device)
bboxes, scores, labels, masks = seg_postprocess(
data, bgr.shape[:2], args.conf_thres, args.iou_thres)
mask, mask_color = [m[:, dh:H - dh, dw:W - dw, :] for m in masks]
inv_alph_masks = (1 - mask * 0.5).cumprod(0)
mcs = (mask_color * inv_alph_masks).sum(0) * 2
seg_img = (seg_img * inv_alph_masks[-1] + mcs) * 255
draw = cv2.resize(seg_img.cpu().numpy().astype(np.uint8),
draw.shape[:2][::-1])
else:
bboxes, scores, labels = det_postprocess(data)
bboxes -= dwdh
bboxes /= ratio
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.round().int().tolist()
cls_id = int(label)
cls = CLASSES[cls_id]
color = COLORS[cls]
cv2.rectangle(draw, bbox[:2], bbox[2:], color, 2)
cv2.putText(draw,
f'{cls}:{score:.3f}', (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.75, [225, 255, 255],
thickness=2)
if args.show:
cv2.imshow('result', draw)
cv2.waitKey(0)
else:
cv2.imwrite(str(save_image), draw)
def crop_mask(masks: Tensor, bboxes: Tensor) -> Tensor:
n, h, w = masks.shape
x1, y1, x2, y2 = torch.chunk(bboxes[:, :, None], 4, 1) # x1 shape(1,1,n)
r = torch.arange(w, device=masks.device,
dtype=x1.dtype)[None, None, :] # rows shape(1,w,1)
c = torch.arange(h, device=masks.device,
dtype=x1.dtype)[None, :, None] # cols shape(h,1,1)
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
def seg_postprocess(
data: Tuple[Tensor],
shape: Union[Tuple, List],
conf_thres: float = 0.25,
iou_thres: float = 0.65) -> Tuple[Tensor, Tensor, Tensor, List]:
assert len(data) == 2
h, w = shape[0] // 4, shape[1] // 4 # 4x downsampling
outputs, proto = (i[0] for i in data)
bboxes, scores, labels, maskconf = outputs.split([4, 1, 1, 32], 1)
scores, labels = scores.squeeze(), labels.squeeze()
select = scores > conf_thres
bboxes, scores, labels, maskconf = bboxes[select], scores[select], labels[
select], maskconf[select]
idx = batched_nms(bboxes, scores, labels, iou_thres)
bboxes, scores, labels, maskconf = bboxes[idx], scores[idx], labels[
idx].int(), maskconf[idx]
masks = (maskconf @ proto).view(-1, h, w)
masks = crop_mask(masks, bboxes / 4.)
masks = F.interpolate(masks[None],
shape,
mode='bilinear',
align_corners=False)[0]
masks = masks.gt_(0.5)[..., None]
cidx = (labels % len(MASK_COLORS)).cpu().numpy()
mask_color = torch.tensor(MASK_COLORS[cidx].reshape(-1, 1, 1,
3)).to(bboxes) * ALPHA
out = [masks, masks @ mask_color]
return bboxes, scores, labels, out
def det_postprocess(data: Tuple[Tensor, Tensor, Tensor, Tensor]):
assert len(data) == 4
num_dets, bboxes, scores, labels = (i[0] for i in data)
nums = num_dets.item()
bboxes = bboxes[:nums]
scores = scores[:nums]
labels = labels[:nums]
return bboxes, scores, labels
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--engine', type=str, help='Engine file')
parser.add_argument('--imgs', type=str, help='Images file')
parser.add_argument('--show',
action='store_true',
help='Show the detection results')
parser.add_argument('--seg', action='store_true', help='Seg inference')
parser.add_argument('--out-dir',
type=str,
default='./output',
help='Path to output file')
parser.add_argument('--conf-thres',
type=float,
default=0.25,
help='Confidence threshold')
parser.add_argument('--iou-thres',
type=float,
default=0.65,
help='Confidence threshold')
parser.add_argument('--device',
type=str,
default='cuda:0',
help='TensorRT infer device')
parser.add_argument('--profile',
action='store_true',
help='Profile TensorRT engine')
args = parser.parse_args()
return args
def profile(args):
device = torch.device(args.device)
Engine = TRTModule(args.engine, device)
profiler = TRTProfilerV0()
Engine.set_profiler(profiler)
random_input = torch.randn(Engine.inp_info[0].shape, device=device)
_ = Engine(random_input)
if __name__ == '__main__':
args = parse_args()
if args.profile:
profile(args)
else:
main(args)

models/engine.py

@@ -1,3 +1,4 @@
+import os
import pickle
from collections import defaultdict, namedtuple
from pathlib import Path

@@ -7,6 +8,8 @@ import onnx
import tensorrt as trt
import torch
+os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
class EngineBuilder:
seg = False

models/torch_utils.py (new file)

@@ -0,0 +1,54 @@
from typing import List, Tuple, Union
import torch
import torch.nn.functional as F
from torch import Tensor
from torchvision.ops import batched_nms
def seg_postprocess(
data: Tuple[Tensor],
shape: Union[Tuple, List],
conf_thres: float = 0.25,
iou_thres: float = 0.65) \
-> Tuple[Tensor, Tensor, Tensor, Tensor]:
assert len(data) == 2
h, w = shape[0] // 4, shape[1] // 4 # 4x downsampling
outputs, proto = (i[0] for i in data)
bboxes, scores, labels, maskconf = outputs.split([4, 1, 1, 32], 1)
scores, labels = scores.squeeze(), labels.squeeze()
idx = scores > conf_thres
bboxes, scores, labels, maskconf = \
bboxes[idx], scores[idx], labels[idx], maskconf[idx]
idx = batched_nms(bboxes, scores, labels, iou_thres)
bboxes, scores, labels, maskconf = \
bboxes[idx], scores[idx], labels[idx].int(), maskconf[idx]
masks = (maskconf @ proto).view(-1, h, w)
masks = crop_mask(masks, bboxes / 4.)
masks = F.interpolate(masks[None],
shape,
mode='bilinear',
align_corners=False)[0]
masks = masks.gt_(0.5)[..., None]
return bboxes, scores, labels, masks
def det_postprocess(data: Tuple[Tensor, Tensor, Tensor, Tensor]):
assert len(data) == 4
num_dets, bboxes, scores, labels = (i[0] for i in data)
nums = num_dets.item()
bboxes = bboxes[:nums]
scores = scores[:nums]
labels = labels[:nums]
return bboxes, scores, labels
def crop_mask(masks: Tensor, bboxes: Tensor) -> Tensor:
n, h, w = masks.shape
x1, y1, x2, y2 = torch.chunk(bboxes[:, :, None], 4, 1) # x1 shape(1,1,n)
r = torch.arange(w, device=masks.device,
dtype=x1.dtype)[None, None, :] # rows shape(1,w,1)
c = torch.arange(h, device=masks.device,
dtype=x1.dtype)[None, :, None] # cols shape(h,1,1)
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
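`crop_mask` zeroes every mask pixel outside its box with pure broadcasting: `r` and `c` are index grids compared against the box edges, giving a boolean window per instance. A toy check with illustrative values:
``` python
import torch

masks = torch.ones(1, 4, 4)                # one 4x4 mask of ones
bboxes = torch.tensor([[1., 1., 3., 3.]])  # keep the central 2x2 window

x1, y1, x2, y2 = torch.chunk(bboxes[:, :, None], 4, 1)
r = torch.arange(4, dtype=x1.dtype)[None, None, :]  # column indices
c = torch.arange(4, dtype=x1.dtype)[None, :, None]  # row indices
print((masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))).int())
# ones only at rows 1-2, columns 1-2
```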

models/utils.py (new file)

@@ -0,0 +1,125 @@
from pathlib import Path
from typing import List, Tuple, Union
import cv2
import numpy as np
from numpy import ndarray
# image suffixes
SUFFIXS = ('.bmp', '.dng', '.jpeg', '.jpg', '.mpo', '.png', '.tif', '.tiff',
'.webp', '.pfm')
def letterbox(im: ndarray,
new_shape: Union[Tuple, List] = (640, 640),
color: Union[Tuple, List] = (114, 114, 114)) \
-> Tuple[ndarray, float, Tuple[float, float]]:
# Resize and pad image while meeting stride-multiple constraints
shape = im.shape[:2] # current shape [height, width]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[
1] # wh padding
dw /= 2 # divide padding into 2 sides
dh /= 2
if shape[::-1] != new_unpad: # resize
im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
im = cv2.copyMakeBorder(im,
top,
bottom,
left,
right,
cv2.BORDER_CONSTANT,
value=color) # add border
return im, r, (dw, dh)
def blob(im: ndarray, return_seg: bool = False) -> Union[ndarray, Tuple]:
if return_seg:
seg = im.astype(np.float32) / 255
im = im.transpose([2, 0, 1])
im = im[np.newaxis, ...]
im = np.ascontiguousarray(im).astype(np.float32) / 255
if return_seg:
return im, seg
else:
return im
def path_to_list(images_path: Union[str, Path]) -> List:
if isinstance(images_path, str):
images_path = Path(images_path)
assert images_path.exists()
if images_path.is_dir():
images = [
i.absolute() for i in images_path.iterdir() if i.suffix in SUFFIXS
]
else:
assert images_path.suffix in SUFFIXS
images = [images_path.absolute()]
return images
def crop_mask(masks: ndarray, bboxes: ndarray) -> ndarray:
n, h, w = masks.shape
x1, y1, x2, y2 = np.split(bboxes[:, :, None], [1, 2, 3],
1) # x1 shape(1,1,n)
r = np.arange(w, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1)
c = np.arange(h, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1)
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
def det_postprocess(data: Tuple[ndarray, ndarray, ndarray, ndarray]):
assert len(data) == 4
num_dets, bboxes, scores, labels = (i[0] for i in data)
nums = num_dets.item()
bboxes = bboxes[:nums]
scores = scores[:nums]
labels = labels[:nums]
return bboxes, scores, labels
def seg_postprocess(
data: Tuple[ndarray],
shape: Union[Tuple, List],
conf_thres: float = 0.25,
iou_thres: float = 0.65) \
-> Tuple[ndarray, ndarray, ndarray, ndarray]:
assert len(data) == 2
h, w = shape[0] // 4, shape[1] // 4 # 4x downsampling
outputs, proto = (i[0] for i in data)
bboxes, scores, labels, maskconf = np.split(outputs, [4, 5, 6], 1)
scores, labels = scores.squeeze(), labels.squeeze()
idx = scores > conf_thres
bboxes, scores, labels, maskconf = \
bboxes[idx], scores[idx], labels[idx], maskconf[idx]
cvbboxes = np.concatenate([bboxes[:, :2], bboxes[:, 2:] - bboxes[:, :2]],
1)
labels = labels.astype(np.int32)
v0, v1 = map(int, (cv2.__version__).split('.')[:2])
assert v0 == 4, 'OpenCV 4.x is required'
# class-aware NMSBoxesBatched is only available from OpenCV 4.7
if v1 > 6:
idx = cv2.dnn.NMSBoxesBatched(cvbboxes, scores, labels, conf_thres,
iou_thres)
else:
idx = cv2.dnn.NMSBoxes(cvbboxes, scores, conf_thres, iou_thres)
bboxes, scores, labels, maskconf = \
bboxes[idx], scores[idx], labels[idx], maskconf[idx]
masks = (maskconf @ proto).reshape(-1, h, w)
masks = crop_mask(masks, bboxes / 4.)
masks = cv2.resize(masks.transpose([1, 2, 0]),
shape,
interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
masks = np.ascontiguousarray((masks > 0.5)[..., None])
return bboxes, scores, labels, masks
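For orientation: the mask branch multiplies per-box coefficients into shared prototype masks at a quarter of the input resolution, and the single matmul in `seg_postprocess` assumes the engine emits `proto` already flattened to `(32, h * w)`. A shape walkthrough with hypothetical sizes (32 prototype channels is YOLOv8's default):
``` python
import numpy as np

# Hypothetical sizes for a 640x640 input: masks live at 160x160 (1/4).
n, h, w = 5, 160, 160
maskconf = np.random.rand(n, 32)              # per-detection coefficients
proto = np.random.rand(32, h * w)             # flattened prototype masks
masks = (maskconf @ proto).reshape(-1, h, w)  # (5, 160, 160)
```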

profile.py (new file)

@@ -0,0 +1,29 @@
from models import TRTModule, TRTProfilerV0 # isort:skip
import argparse
import torch
def profile(args):
device = torch.device(args.device)
Engine = TRTModule(args.engine, device)
profiler = TRTProfilerV0()
Engine.set_profiler(profiler)
random_input = torch.randn(Engine.inp_info[0].shape, device=device)
_ = Engine(random_input)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--engine', type=str, help='Engine file')
parser.add_argument('--device',
type=str,
default='cuda:0',
help='TensorRT infer device')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
profile(args)