From 9305126e14c214c5a526bbcaced079f11e399aa3 Mon Sep 17 00:00:00 2001
From: triple-Mu
Date: Fri, 6 Jan 2023 11:14:39 +0800
Subject: [PATCH] Add profiler

---
 README.md          | 35 ++++++++++++++++++++++++---------
 infer.py           | 18 +++++++++++++++--
 models/__init__.py |  4 ++--
 models/engine.py   | 48 +++++++++++++++++++++++++++++++++++++---------
 4 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 3bb2261..014618c 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,25 @@
 # YOLOv8-TensorRT
+
 YOLOv8 using TensorRT acceleration!

 # Preprocessed ONNX model
+
 You can download the ONNX model pretrained by https://github.com/ultralytics .

-[**YOLOv8-n**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)
+[**YOLOv8-n
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)

-[**YOLOv8-s**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)
+[**YOLOv8-s
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)

-[**YOLOv8-m**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)
+[**YOLOv8-m
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)

-[**YOLOv8-l**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)
+[**YOLOv8-l
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)

-[**YOLOv8-x**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)
+[**YOLOv8-x
+**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)

 # Build TensorRT engine by ONNX
@@ -20,7 +27,7 @@
 You can export TensorRT engine by [`build.py`](build.py).

-Usage: 
+Usage:

 ``` shell
 python3 build.py --onnx yolov8s_nms.onnx --device cuda:0 --fp16
 ```
@@ -42,9 +49,11 @@ Usage:

 /usr/src/tensorrt/bin/trtexec --onnx=yolov8s_nms.onnx --saveEngine=yolov8s_nms.engine --fp16
 ```

-***If you installed TensorRT by a debian package, then the installation path of `trtexec` is `/usr/src/tensorrt/bin/trtexec`***
+***If you installed TensorRT by a Debian package, then the installation path of `trtexec`
+is `/usr/src/tensorrt/bin/trtexec`***

-***If you installed TensorRT by a tar package, then the installation path of trtexec is under the `bin` folder in the path you decompressed***
+***If you installed TensorRT by a tar package, then the installation path of `trtexec` is under the `bin` folder in the
+path where you decompressed it***

 # Infer images by the engine you exported
@@ -68,5 +77,13 @@
 python3 infer.py --engine yolov8s_nms.engine --imgs data --show --out-dir outputs
 ```

 - `--device` : The CUDA device you use.
-
+- `--profile` : Profile the TensorRT engine.
+
+If you want to profile the TensorRT engine:
+
+Usage:
+
+``` shell
+python3 infer.py --engine yolov8s_nms.engine --profile
+```

diff --git a/infer.py b/infer.py
index 554900a..ccedff8 100644
--- a/infer.py
+++ b/infer.py
@@ -1,4 +1,4 @@
-from models import TRTModule
+from models import TRTModule, TRTProfilerV0
 from pathlib import Path
 import cv2
 import argparse
@@ -112,10 +112,24 @@ def parse_args():
         '--out-dir', type=str, default='./output', help='Path to output file')
     parser.add_argument(
         '--device', type=str, default='cuda:0', help='TensorRT infer device')
+    parser.add_argument(
+        '--profile', action='store_true', help='Profile TensorRT engine')
     args = parser.parse_args()
     return args


+def profile(args):
+    device = torch.device(args.device)
+    Engine = TRTModule(args.engine, device)
+    profiler = TRTProfilerV0()
+    Engine.set_profiler(profiler)
+    random_input = torch.randn(Engine.inp_info[0].shape, device=device)
+    _ = Engine(random_input)
+
+
 if __name__ == '__main__':
     args = parse_args()
-    main(args)
+    if args.profile:
+        profile(args)
+    else:
+        main(args)

diff --git a/models/__init__.py b/models/__init__.py
index fc6d4e8..20da0b9 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -1,3 +1,3 @@
-from .engine import EngineBuilder, TRTModule
+from .engine import EngineBuilder, TRTModule, TRTProfilerV0, TRTProfilerV1

-__all__ = ['EngineBuilder', 'TRTModule']
\ No newline at end of file
+__all__ = ['EngineBuilder', 'TRTModule', 'TRTProfilerV0', 'TRTProfilerV1']
\ No newline at end of file

diff --git a/models/engine.py b/models/engine.py
index e36cddd..136ae66 100644
--- a/models/engine.py
+++ b/models/engine.py
@@ -1,6 +1,6 @@
 from pathlib import Path
-from typing import Optional, Union, List
-from collections import namedtuple
+from typing import Optional, Union, List, Tuple
+from collections import namedtuple, defaultdict

 try:
     import tensorrt as trt
@@ -40,9 +40,9 @@ class EngineBuilder:
         outputs = [network.get_output(i) for i in range(network.num_outputs)]

         for inp in inputs:
-            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape{inp.shape} {inp.dtype}')
+            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape: {inp.shape} dtype: {inp.dtype}')
         for out in outputs:
-            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape{out.shape} {out.dtype}')
+            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape: {out.shape} dtype: {out.dtype}')
         if fp16 and builder.platform_has_fast_fp16:
             config.set_flag(trt.BuilderFlag.FP16)
         self.weight = self.checkpoint.with_suffix('.engine')
@@ -53,7 +53,7 @@ class EngineBuilder:
         self.weight.write_bytes(engine.serialize())
         logger.log(trt.Logger.WARNING, f'Build tensorrt engine finish.\nSave in {str(self.weight.absolute())}')

-    def build(self, fp16: bool = True, with_profiling=True):
+    def build(self, fp16: bool = True, with_profiling=True) -> None:
         self.__build_engine(fp16, with_profiling)

@@ -64,7 +64,7 @@ class TRTModule(torch.nn.Module):
         trt.float16: torch.float16,
         trt.float32: torch.float32}

-    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]):
+    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]) -> None:
         super(TRTModule, self).__init__()
         self.weight = Path(weight) if isinstance(weight, str) else weight
         self.device = device if device is not None else torch.device('cuda:0')
@@ -72,7 +72,7 @@ class TRTModule(torch.nn.Module):
         self.__init_engine()
         self.__init_bindings()

-    def __init_engine(self):
+    def __init_engine(self) -> None:
         logger = trt.Logger(trt.Logger.WARNING)
         trt.init_libnvinfer_plugins(logger, namespace='')
         with trt.Runtime(logger) as runtime:
@@ -98,7 +98,7 @@ class TRTModule(torch.nn.Module):
         self.input_names = names[:num_inputs]
         self.output_names = names[num_inputs:]

-    def __init_bindings(self):
+    def __init_bindings(self) -> None:
         dynamic = False
         Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))
         inp_info = []
@@ -122,7 +122,10 @@ class TRTModule(torch.nn.Module):
         self.inp_info = inp_info
         self.out_infp = out_info

-    def forward(self, *inputs):
+    def set_profiler(self, profiler: Optional[trt.IProfiler]):
+        self.context.profiler = profiler if profiler is not None else trt.Profiler()
+
+    def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:
         assert len(inputs) == self.num_inputs
         contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs]

@@ -148,3 +151,30 @@ class TRTModule(torch.nn.Module):
         self.stream.synchronize()

         return tuple(outputs) if len(outputs) > 1 else outputs[0]
+
+
+class TRTProfilerV1(trt.IProfiler):
+    def __init__(self):
+        trt.IProfiler.__init__(self)
+        self.total_runtime = 0.0
+        self.recorder = defaultdict(float)
+
+    def report_layer_time(self, layer_name: str, ms: float):
+        self.total_runtime += ms * 1000
+        self.recorder[layer_name] += ms * 1000
+
+    def report(self):
+        f = '\t%40s\t\t\t\t%10.4f'
+        print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))
+        for name, cost in sorted(self.recorder.items(), key=lambda x: -x[1]):
+            print(f % (name if len(name) < 40 else name[:35] + ' ' + '*' * 4, cost))
+        print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')
+
+
+class TRTProfilerV0(trt.IProfiler):
+    def __init__(self):
+        trt.IProfiler.__init__(self)
+
+    def report_layer_time(self, layer_name: str, ms: float):
+        f = '\t%40s\t\t\t\t%10.4fms'
+        print(f % (layer_name if len(layer_name) < 40 else layer_name[:35] + ' ' + '*' * 4, ms))
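
The patch provides two profilers: `TRTProfilerV0` prints each layer's time immediately as TensorRT reports it (this is what the `--profile` flag uses), while `TRTProfilerV1` accumulates per-layer totals in microseconds and prints a sorted summary on demand. A minimal sketch of driving `TRTProfilerV1` directly, assuming the patched `models` package is importable and a built engine such as `yolov8s_nms.engine` exists on disk:

``` python
import torch

from models import TRTModule, TRTProfilerV1

device = torch.device('cuda:0')
engine = TRTModule('yolov8s_nms.engine', device)

# Attach the accumulating profiler; TensorRT calls its
# report_layer_time() for every layer on each execution.
profiler = TRTProfilerV1()
engine.set_profiler(profiler)

# A single dummy inference is enough to populate the recorder,
# mirroring what profile() in infer.py does with TRTProfilerV0.
dummy = torch.randn(engine.inp_info[0].shape, device=device)
_ = engine(dummy)

# Print per-layer cost in microseconds, slowest layers first.
profiler.report()
```

Note that only `TRTProfilerV1` has a `report()` method; `TRTProfilerV0` prints as it goes and keeps no state. The sketch also assumes a static input shape: for an engine built with dynamic shapes, `inp_info[0].shape` may contain -1 and a concrete shape would have to be supplied instead.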