From 9305126e14c214c5a526bbcaced079f11e399aa3 Mon Sep 17 00:00:00 2001
From: triple-Mu
Date: Fri, 6 Jan 2023 11:14:39 +0800
Subject: [PATCH] Add profiler

---
 README.md          | 35 ++++++++++++++++++++++++---------
 infer.py           | 18 +++++++++++++++--
 models/__init__.py |  4 ++--
 models/engine.py   | 48 +++++++++++++++++++++++++++++++++++++---------
 4 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 3bb2261..014618c 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,25 @@
 # YOLOv8-TensorRT
+
 YOLOv8 using TensorRT acceleration!

 # Preprocessed ONNX model
+
 You can download the ONNX model pretrained by https://github.com/ultralytics .

-[**YOLOv8-n**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)
+[**YOLOv8-n
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)

-[**YOLOv8-s**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)
+[**YOLOv8-s
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)

-[**YOLOv8-m**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)
+[**YOLOv8-m
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)

-[**YOLOv8-l**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)
+[**YOLOv8-l
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)

-[**YOLOv8-x**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)
+[**YOLOv8-x
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)

 # Build TensorRT engine by ONNX
@@ -20,7 +27,7 @@
 You can export TensorRT engine by [`build.py`](build.py).

-Usage: 
+Usage:

 ``` shell
 python3 build.py --onnx yolov8s_nms.onnx --device cuda:0 --fp16
 ```
@@ -42,9 +49,11 @@ Usage:

 /usr/src/tensorrt/bin/trtexec --onnx=yolov8s_nms.onnx --saveEngine=yolov8s_nms.engine --fp16
 ```

-***If you installed TensorRT by a debian package, then the installation path of `trtexec` is `/usr/src/tensorrt/bin/trtexec`***
+***If you installed TensorRT by a Debian package, then the installation path of `trtexec`
+is `/usr/src/tensorrt/bin/trtexec`***

-***If you installed TensorRT by a tar package, then the installation path of trtexec is under the `bin` folder in the path you decompressed***
+***If you installed TensorRT by a tar package, then the installation path of `trtexec` is under the `bin` folder in the
+path where you decompressed it***

 # Infer images by the engine you exported
@@ -68,5 +77,13 @@
 python3 infer.py --engine yolov8s_nms.engine --imgs data --show --out-dir outputs
 ```

 - `--device` : The CUDA device you use.
-
+- `--profile` : Profile the TensorRT engine.
+
+If you want to profile the TensorRT engine:
+
+Usage:
+
+``` shell
+python3 infer.py --engine yolov8s_nms.engine --profile
+```

diff --git a/infer.py b/infer.py
index 554900a..ccedff8 100644
--- a/infer.py
+++ b/infer.py
@@ -1,4 +1,4 @@
-from models import TRTModule
+from models import TRTModule, TRTProfilerV0
 from pathlib import Path
 import cv2
 import argparse
@@ -112,10 +112,24 @@ def parse_args():
         '--out-dir', type=str, default='./output', help='Path to output file')
     parser.add_argument(
         '--device', type=str, default='cuda:0', help='TensorRT infer device')
+    parser.add_argument(
+        '--profile', action='store_true', help='Profile TensorRT engine')
     args = parser.parse_args()
     return args


+def profile(args):
+    device = torch.device(args.device)
+    Engine = TRTModule(args.engine, device)
+    profiler = TRTProfilerV0()
+    Engine.set_profiler(profiler)
+    random_input = torch.randn(Engine.inp_info[0].shape, device=device)
+    _ = Engine(random_input)
+
+
 if __name__ == '__main__':
     args = parse_args()
-    main(args)
+    if args.profile:
+        profile(args)
+    else:
+        main(args)

diff --git a/models/__init__.py b/models/__init__.py
index fc6d4e8..20da0b9 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -1,3 +1,3 @@
-from .engine import EngineBuilder, TRTModule
+from .engine import EngineBuilder, TRTModule, TRTProfilerV0, TRTProfilerV1

-__all__ = ['EngineBuilder', 'TRTModule']
\ No newline at end of file
+__all__ = ['EngineBuilder', 'TRTModule', 'TRTProfilerV0', 'TRTProfilerV1']
\ No newline at end of file

diff --git a/models/engine.py b/models/engine.py
index e36cddd..136ae66 100644
--- a/models/engine.py
+++ b/models/engine.py
@@ -1,6 +1,6 @@
 from pathlib import Path
-from typing import Optional, Union, List
-from collections import namedtuple
+from typing import Optional, Union, List, Tuple
+from collections import namedtuple, defaultdict

 try:
     import tensorrt as trt
@@ -40,9 +40,9 @@ class EngineBuilder:
         outputs = [network.get_output(i) for i in range(network.num_outputs)]

         for inp in inputs:
-            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape{inp.shape} {inp.dtype}')
+            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape: {inp.shape} dtype: {inp.dtype}')
         for out in outputs:
-            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape{out.shape} {out.dtype}')
+            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape: {out.shape} dtype: {out.dtype}')
         if fp16 and builder.platform_has_fast_fp16:
             config.set_flag(trt.BuilderFlag.FP16)
         self.weight = self.checkpoint.with_suffix('.engine')
@@ -53,7 +53,7 @@ class EngineBuilder:
         self.weight.write_bytes(engine.serialize())
         logger.log(trt.Logger.WARNING, f'Build tensorrt engine finish.\nSave in {str(self.weight.absolute())}')

-    def build(self, fp16: bool = True, with_profiling=True):
+    def build(self, fp16: bool = True, with_profiling=True) -> None:
         self.__build_engine(fp16, with_profiling)

@@ -64,7 +64,7 @@ class TRTModule(torch.nn.Module):
         trt.float16: torch.float16,
         trt.float32: torch.float32}

-    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]):
+    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]) -> None:
         super(TRTModule, self).__init__()
         self.weight = Path(weight) if isinstance(weight, str) else weight
         self.device = device if device is not None else torch.device('cuda:0')
@@ -72,7 +72,7 @@ class TRTModule(torch.nn.Module):
         self.__init_engine()
         self.__init_bindings()

-    def __init_engine(self):
+    def __init_engine(self) -> None:
         logger = trt.Logger(trt.Logger.WARNING)
         trt.init_libnvinfer_plugins(logger, namespace='')
         with trt.Runtime(logger) as runtime:
@@ -98,7 +98,7 @@ class TRTModule(torch.nn.Module):
         self.input_names = names[:num_inputs]
         self.output_names = names[num_inputs:]

-    def __init_bindings(self):
+    def __init_bindings(self) -> None:
         dynamic = False
         Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))
         inp_info = []
@@ -122,7 +122,10 @@ class TRTModule(torch.nn.Module):
         self.inp_info = inp_info
         self.out_infp = out_info

-    def forward(self, *inputs):
+    def set_profiler(self, profiler: Optional[trt.IProfiler]):
+        self.context.profiler = profiler if profiler is not None else trt.Profiler()
+
+    def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:
         assert len(inputs) == self.num_inputs
         contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs]

@@ -148,3 +151,30 @@ class TRTModule(torch.nn.Module):
         self.stream.synchronize()

         return tuple(outputs) if len(outputs) > 1 else outputs[0]
+
+
+class TRTProfilerV1(trt.IProfiler):
+    def __init__(self):
+        trt.IProfiler.__init__(self)
+        self.total_runtime = 0.0
+        self.recorder = defaultdict(float)
+
+    def report_layer_time(self, layer_name: str, ms: float):
+        self.total_runtime += ms * 1000
+        self.recorder[layer_name] += ms * 1000
+
+    def report(self):
+        f = '\t%40s\t\t\t\t%10.4f'
+        print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))
+        for name, cost in sorted(self.recorder.items(), key=lambda x: -x[1]):
+            print(f % (name if len(name) < 40 else name[:35] + ' ' + '*' * 4, cost))
+        print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')
+
+
+class TRTProfilerV0(trt.IProfiler):
+    def __init__(self):
+        trt.IProfiler.__init__(self)
+
+    def report_layer_time(self, layer_name: str, ms: float):
+        f = '\t%40s\t\t\t\t%10.4fms'
+        print(f % (layer_name if len(layer_name) < 40 else layer_name[:35] + ' ' + '*' * 4, ms))
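
The patch provides two profilers: `TRTProfilerV0` prints each layer's time immediately as TensorRT reports it (this is what the `--profile` flag uses), while `TRTProfilerV1` accumulates per-layer totals in microseconds and prints a sorted summary on demand. A minimal sketch of driving `TRTProfilerV1` directly, assuming the patched `models` package is importable and a built engine such as `yolov8s_nms.engine` exists on disk:

``` python
import torch

from models import TRTModule, TRTProfilerV1

device = torch.device('cuda:0')
engine = TRTModule('yolov8s_nms.engine', device)

# Attach the accumulating profiler; TensorRT calls its
# report_layer_time() for every layer on each execution.
profiler = TRTProfilerV1()
engine.set_profiler(profiler)

# A single dummy inference is enough to populate the recorder,
# mirroring what profile() in infer.py does with TRTProfilerV0.
dummy = torch.randn(engine.inp_info[0].shape, device=device)
_ = engine(dummy)

# Print per-layer cost in microseconds, slowest layers first.
profiler.report()
```

Note that only `TRTProfilerV1` has a `report()` method; `TRTProfilerV0` prints as it goes and keeps no state. The sketch also assumes a static input shape: for an engine built with dynamic shapes, `inp_info[0].shape` may contain -1 and a concrete shape would have to be supplied instead.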