Add profiler

Branch: pull/1/head
Author: triple-Mu, 2 years ago
Parent: 7c70bf6864 · Commit: 9305126e14
Changed files:
1. README.md (31 lines changed)
2. infer.py (18 lines changed)
3. models/__init__.py (4 lines changed)
4. models/engine.py (48 lines changed)

README.md
@@ -1,18 +1,25 @@
# YOLOv8-TensorRT
YOLOv8 accelerated with TensorRT!
# Preprocessed ONNX model
You can download the ONNX models pretrained by https://github.com/ultralytics :
[**YOLOv8-n**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)
[**YOLOv8-s**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)
[**YOLOv8-m**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)
[**YOLOv8-l**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)
[**YOLOv8-x**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)
# Build a TensorRT engine from the ONNX model
@@ -42,9 +49,11 @@ Usage:
/usr/src/tensorrt/bin/trtexec --onnx=yolov8s_nms.onnx --saveEngine=yolov8s_nms.engine --fp16
```
***If you installed TensorRT from a Debian package, `trtexec` is located at `/usr/src/tensorrt/bin/trtexec`.***
***If you installed TensorRT from a tar package, `trtexec` is under the `bin` folder of the directory you extracted it to.***
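If you prefer to build from Python, the repo's `EngineBuilder` (defined in models/engine.py, see the diff below) wraps the same conversion. A minimal sketch, assuming the constructor takes the ONNX checkpoint path; the constructor itself is not shown in this diff:

``` python
# Assumed constructor argument: path to the preprocessed ONNX checkpoint.
from models import EngineBuilder

builder = EngineBuilder('yolov8s_nms.onnx')
# build() appears in the diff below; per the diff it writes the engine
# next to the ONNX file (checkpoint.with_suffix('.engine')).
builder.build(fp16=True)
```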
# Infer images with the engine you exported
@@ -68,5 +77,13 @@ python3 infer.py --engine yolov8s_nms.engine --imgs data --show --out-dir output
- `--device` : The CUDA device you use.
- `--profile` : Profile the TensorRT engine.
If you want to profile the TensorRT engine:
Usage:
``` shell
python3 infer.py --engine yolov8s_nms.engine --profile
```
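`trtexec` can also profile a saved engine directly, without going through infer.py; `--loadEngine` and `--dumpProfile` are standard `trtexec` flags:

``` shell
/usr/src/tensorrt/bin/trtexec --loadEngine=yolov8s_nms.engine --dumpProfile
```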

infer.py
@@ -1,4 +1,4 @@
from models import TRTModule, TRTProfilerV0
from pathlib import Path
import cv2
import argparse
@@ -112,10 +112,24 @@ def parse_args():
        '--out-dir', type=str, default='./output', help='Path to output file')
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='TensorRT infer device')
    parser.add_argument(
        '--profile', action='store_true', help='Profile TensorRT engine')
    args = parser.parse_args()
    return args


def profile(args):
    device = torch.device(args.device)
    Engine = TRTModule(args.engine, device)
    profiler = TRTProfilerV0()
    Engine.set_profiler(profiler)
    # One forward pass with a random input is enough to get per-layer timings.
    random_input = torch.randn(Engine.inp_info[0].shape, device=device)
    _ = Engine(random_input)


if __name__ == '__main__':
    args = parse_args()
    if args.profile:
        profile(args)
    else:
        main(args)

models/__init__.py
@@ -1,3 +1,3 @@
from .engine import EngineBuilder, TRTModule, TRTProfilerV0, TRTProfilerV1

__all__ = ['EngineBuilder', 'TRTModule', 'TRTProfilerV0', 'TRTProfilerV1']

models/engine.py
@@ -1,6 +1,6 @@
from pathlib import Path
from typing import Optional, Union, List, Tuple
from collections import namedtuple, defaultdict

try:
    import tensorrt as trt
@@ -40,9 +40,9 @@ class EngineBuilder:
        outputs = [network.get_output(i) for i in range(network.num_outputs)]
        for inp in inputs:
            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape: {inp.shape} dtype: {inp.dtype}')
        for out in outputs:
            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape: {out.shape} dtype: {out.dtype}')
        if fp16 and builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        self.weight = self.checkpoint.with_suffix('.engine')
@@ -53,7 +53,7 @@ class EngineBuilder:
        self.weight.write_bytes(engine.serialize())
        logger.log(trt.Logger.WARNING, f'Build TensorRT engine finished.\nSaved to {str(self.weight.absolute())}')

    def build(self, fp16: bool = True, with_profiling=True) -> None:
        self.__build_engine(fp16, with_profiling)
@@ -64,7 +64,7 @@ class TRTModule(torch.nn.Module):
                     trt.float16: torch.float16,
                     trt.float32: torch.float32}

    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]) -> None:
        super(TRTModule, self).__init__()
        self.weight = Path(weight) if isinstance(weight, str) else weight
        self.device = device if device is not None else torch.device('cuda:0')
@@ -72,7 +72,7 @@ class TRTModule(torch.nn.Module):
        self.__init_engine()
        self.__init_bindings()

    def __init_engine(self) -> None:
        logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(logger, namespace='')
        with trt.Runtime(logger) as runtime:
@@ -98,7 +98,7 @@ class TRTModule(torch.nn.Module):
        self.input_names = names[:num_inputs]
        self.output_names = names[num_inputs:]

    def __init_bindings(self) -> None:
        dynamic = False
        Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))
        inp_info = []
@@ -122,7 +122,10 @@ class TRTModule(torch.nn.Module):
        self.inp_info = inp_info
        self.out_info = out_info

    def set_profiler(self, profiler: Optional[trt.IProfiler]):
        # Attach a profiler to the execution context, falling back to
        # TensorRT's built-in trt.Profiler when none is given.
        self.context.profiler = profiler if profiler is not None else trt.Profiler()

    def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:
        assert len(inputs) == self.num_inputs
        contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs]
@@ -148,3 +151,30 @@ class TRTModule(torch.nn.Module):
        self.stream.synchronize()

        return tuple(outputs) if len(outputs) > 1 else outputs[0]


class TRTProfilerV1(trt.IProfiler):
    # Accumulating profiler: sums per-layer times across calls and reports
    # them sorted by total cost.
    def __init__(self):
        trt.IProfiler.__init__(self)
        self.total_runtime = 0.0
        self.recorder = defaultdict(float)

    def report_layer_time(self, layer_name: str, ms: float):
        # TensorRT reports milliseconds; store microseconds.
        self.total_runtime += ms * 1000
        self.recorder[layer_name] += ms * 1000

    def report(self):
        f = '\t%40s\t\t\t\t%10.4f'
        print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))
        for name, cost in sorted(self.recorder.items(), key=lambda x: -x[1]):
            print(f % (name if len(name) < 40 else name[:35] + ' ' + '*' * 4, cost))
        print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')


class TRTProfilerV0(trt.IProfiler):
    # Streaming profiler: prints each layer's time as soon as TensorRT reports it.
    def __init__(self):
        trt.IProfiler.__init__(self)

    def report_layer_time(self, layer_name: str, ms: float):
        f = '\t%40s\t\t\t\t%10.4fms'
        print(f % (layer_name if len(layer_name) < 40 else layer_name[:35] + ' ' + '*' * 4, ms))
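The `profile` helper in infer.py above attaches `TRTProfilerV0`, so each layer's time is printed as it is measured. For totals aggregated across several runs, `TRTProfilerV1` can be attached the same way; a minimal sketch using only names from this commit:

``` python
import torch
from models import TRTModule, TRTProfilerV1

device = torch.device('cuda:0')
engine = TRTModule('yolov8s_nms.engine', device)
profiler = TRTProfilerV1()
engine.set_profiler(profiler)

# Several passes make the accumulated per-layer costs more stable.
dummy = torch.randn(engine.inp_info[0].shape, device=device)
for _ in range(10):
    engine(dummy)
profiler.report()  # per-layer cost in us, sorted by total, plus the sum
```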
