Add profiler

2 years ago · 9305126e14
parent 7c70bf6864
commit 9305126e14
4 changed files with 83 additions and 22 deletions
--- a/README.md
+++ b/README.md
@ -1,18 +1,25 @@
 # YOLOv8-TensorRT
+
 YOLOv8 accelerated with TensorRT!

 # Preprocessed ONNX model
+
 You can download the ONNX model which is pretrained by https://github.com/ultralytics.

-[**YOLOv8-n**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)
+[**YOLOv8-n**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)

-[**YOLOv8-s**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)
+[**YOLOv8-s**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)

-[**YOLOv8-m**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)
+[**YOLOv8-m**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)

-[**YOLOv8-l**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)
+[**YOLOv8-l**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)

-[**YOLOv8-x**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)
+[**YOLOv8-x**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)

 # Build TensorRT engine by ONNX

@ -20,7 +27,7 @@ You can dowload the onnx model which is pretrained by https://github.com/ultraly

 You can export a TensorRT engine with [`build.py`](build.py).

-Usage: 
+Usage:

 ``` shell
 python3 build.py --onnx yolov8s_nms.onnx --device cuda:0 --fp16
@ -42,9 +49,11 @@ Usage:
 /usr/src/tensorrt/bin/trtexec --onnx=yolov8s_nms.onnx --saveEngine=yolov8s_nms.engine --fp16
 ```

-***If you installed TensorRT by a debian package, then the installation path of `trtexec` is `/usr/src/tensorrt/bin/trtexec`***
+***If you installed TensorRT by a debian package, then the installation path of `trtexec`
+is `/usr/src/tensorrt/bin/trtexec`***

-***If you installed TensorRT by a tar package, then the installation path of trtexec is under the `bin` folder in the path you decompressed***
+***If you installed TensorRT by a tar package, then the installation path of `trtexec` is under the `bin` folder in the
+path you decompressed***

 # Infer images by the engine which you export

@ -68,5 +77,13 @@ python3 infer.py --engine yolov8s_nms.engine --imgs data --show --out-dir output

 - `--device` : The CUDA device you use.

-  
+- `--profile` : Profile the TensorRT engine.
+
+If you want to profile the TensorRT engine:
+
+Usage:
+
+``` shell
+python3 infer.py --engine yolov8s_nms.engine --profile
+```

--- a/infer.py
+++ b/infer.py
@ -1,4 +1,4 @@
-from models import TRTModule
+from models import TRTModule, TRTProfilerV0
 from pathlib import Path
 import cv2
 import argparse
@ -112,10 +112,24 @@ def parse_args():
        '--out-dir', type=str, default='./output', help='Path to output file')
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='TensorRT infer device')
+    parser.add_argument(
+        '--profile', action='store_true', help='Profile TensorRT engine')
    args = parser.parse_args()
    return args


+def profile(args):
+    device = torch.device(args.device)
+    Engine = TRTModule(args.engine, device)
+    profiler = TRTProfilerV0()
+    Engine.set_profiler(profiler)
+    random_input = torch.randn(Engine.inp_info[0].shape, device=device)
+    _ = Engine(random_input)
+
+
 if __name__ == '__main__':
    args = parse_args()
-    main(args)
+    if args.profile:
+        profile(args)
+    else:
+        main(args)
--- a/models/__init__.py
+++ b/models/__init__.py
@ -1,3 +1,3 @@
-from .engine import EngineBuilder, TRTModule
+from .engine import EngineBuilder, TRTModule, TRTProfilerV0, TRTProfilerV1

-__all__ = ['EngineBuilder', 'TRTModule']
+__all__ = ['EngineBuilder', 'TRTModule', 'TRTProfilerV0', 'TRTProfilerV1']
--- a/models/engine.py
+++ b/models/engine.py
@ -1,6 +1,6 @@
 from pathlib import Path
-from typing import Optional, Union, List
-from collections import namedtuple
+from typing import Optional, Union, List, Tuple
+from collections import namedtuple, defaultdict

 try:
    import tensorrt as trt
@ -40,9 +40,9 @@ class EngineBuilder:
        outputs = [network.get_output(i) for i in range(network.num_outputs)]

        for inp in inputs:
-            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape{inp.shape} {inp.dtype}')
+            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape: {inp.shape} dtype: {inp.dtype}')
        for out in outputs:
-            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape{out.shape} {out.dtype}')
+            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape: {out.shape} dtype: {out.dtype}')
        if fp16 and builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        self.weight = self.checkpoint.with_suffix('.engine')
@ -53,7 +53,7 @@ class EngineBuilder:
            self.weight.write_bytes(engine.serialize())
        logger.log(trt.Logger.WARNING, f'Build tensorrt engine finish.\nSave in {str(self.weight.absolute())}')

-    def build(self, fp16: bool = True, with_profiling=True):
+    def build(self, fp16: bool = True, with_profiling=True) -> None:
        self.__build_engine(fp16, with_profiling)


@ -64,7 +64,7 @@ class TRTModule(torch.nn.Module):
                    trt.float16: torch.float16,
                    trt.float32: torch.float32}

-    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]):
+    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]) -> None:
        super(TRTModule, self).__init__()
        self.weight = Path(weight) if isinstance(weight, str) else weight
        self.device = device if device is not None else torch.device('cuda:0')
@ -72,7 +72,7 @@ class TRTModule(torch.nn.Module):
        self.__init_engine()
        self.__init_bindings()

-    def __init_engine(self):
+    def __init_engine(self) -> None:
        logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(logger, namespace='')
        with trt.Runtime(logger) as runtime:
@ -98,7 +98,7 @@ class TRTModule(torch.nn.Module):
        self.input_names = names[:num_inputs]
        self.output_names = names[num_inputs:]

-    def __init_bindings(self):
+    def __init_bindings(self) -> None:
        dynamic = False
        Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))
        inp_info = []
@ -122,7 +122,10 @@ class TRTModule(torch.nn.Module):
        self.inp_info = inp_info
        self.out_infp = out_info

-    def forward(self, *inputs):
+    def set_profiler(self, profiler: Optional[trt.IProfiler]):
+        self.context.profiler = profiler if profiler is not None else trt.Profiler()
+
+    def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:

        assert len(inputs) == self.num_inputs
        contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs]
@ -148,3 +151,30 @@ class TRTModule(torch.nn.Module):
        self.stream.synchronize()

        return tuple(outputs) if len(outputs) > 1 else outputs[0]
+
+
+class TRTProfilerV1(trt.IProfiler):
+    def __init__(self):
+        trt.IProfiler.__init__(self)
+        self.total_runtime = 0.0
+        self.recorder = defaultdict(float)
+
+    def report_layer_time(self, layer_name: str, ms: float):
+        self.total_runtime += ms * 1000
+        self.recorder[layer_name] += ms * 1000
+
+    def report(self):
+        f = '\t%40s\t\t\t\t%10.4f'
+        print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))
+        for name, cost in sorted(self.recorder.items(), key=lambda x: -x[1]):
+            print(f % (name if len(name) < 40 else name[:35] + ' ' + '*' * 4, cost))
+        print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')
+
+
+class TRTProfilerV0(trt.IProfiler):
+    def __init__(self):
+        trt.IProfiler.__init__(self)
+
+    def report_layer_time(self, layer_name: str, ms: float):
+        f = '\t%40s\t\t\t\t%10.4fms'
+        print(f % (layer_name if len(layer_name) < 40 else layer_name[:35] + ' ' + '*' * 4, ms))