Add profiler

Branch: pull/1/head
Author: triple-Mu, 2 years ago
Parent: 7c70bf6864 · Commit: 9305126e14
Changed files:
1. README.md (31 lines changed)
2. infer.py (18 lines changed)
3. models/__init__.py (4 lines changed)
4. models/engine.py (48 lines changed)

README.md
@@ -1,18 +1,25 @@
# YOLOv8-TensorRT
YOLOv8 accelerated with TensorRT!
# Preprocessed ONNX model
You can download the ONNX models pretrained by https://github.com/ultralytics :
[**YOLOv8-n**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)
[**YOLOv8-s**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)
[**YOLOv8-m**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)
[**YOLOv8-l**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)
[**YOLOv8-x**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)
# Build a TensorRT engine from the ONNX model
@@ -42,9 +49,11 @@ Usage:
/usr/src/tensorrt/bin/trtexec --onnx=yolov8s_nms.onnx --saveEngine=yolov8s_nms.engine --fp16
```
***If you installed TensorRT from a Debian package, `trtexec` is located at `/usr/src/tensorrt/bin/trtexec`.***
***If you installed TensorRT from a tar package, `trtexec` is under the `bin` folder of the directory you extracted it to.***
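If you prefer to build from Python, the repo's `EngineBuilder` (defined in models/engine.py, see the diff below) wraps the same conversion. A minimal sketch, assuming the constructor takes the ONNX checkpoint path; the constructor itself is not shown in this diff:

``` python
# Assumed constructor argument: path to the preprocessed ONNX checkpoint.
from models import EngineBuilder

builder = EngineBuilder('yolov8s_nms.onnx')
# build() appears in the diff below; per the diff it writes the engine
# next to the ONNX file (checkpoint.with_suffix('.engine')).
builder.build(fp16=True)
```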
# Infer images with the engine you exported
@@ -68,5 +77,13 @@ python3 infer.py --engine yolov8s_nms.engine --imgs data --show --out-dir output
- `--device` : The CUDA device you use.
- `--profile` : Profile the TensorRT engine.
If you want to profile the TensorRT engine:
Usage:
``` shell
python3 infer.py --engine yolov8s_nms.engine --profile
```
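`trtexec` can also profile a saved engine directly, without going through infer.py; `--loadEngine` and `--dumpProfile` are standard `trtexec` flags:

``` shell
/usr/src/tensorrt/bin/trtexec --loadEngine=yolov8s_nms.engine --dumpProfile
```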

infer.py
@@ -1,4 +1,4 @@
from models import TRTModule, TRTProfilerV0
from pathlib import Path
import cv2
import argparse
@@ -112,10 +112,24 @@ def parse_args():
        '--out-dir', type=str, default='./output', help='Path to output file')
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='TensorRT infer device')
    parser.add_argument(
        '--profile', action='store_true', help='Profile TensorRT engine')
    args = parser.parse_args()
    return args


def profile(args):
    device = torch.device(args.device)
    Engine = TRTModule(args.engine, device)
    profiler = TRTProfilerV0()
    Engine.set_profiler(profiler)
    # One forward pass with a random input is enough to get per-layer timings.
    random_input = torch.randn(Engine.inp_info[0].shape, device=device)
    _ = Engine(random_input)


if __name__ == '__main__':
    args = parse_args()
    if args.profile:
        profile(args)
    else:
        main(args)

models/__init__.py
@@ -1,3 +1,3 @@
from .engine import EngineBuilder, TRTModule, TRTProfilerV0, TRTProfilerV1

__all__ = ['EngineBuilder', 'TRTModule', 'TRTProfilerV0', 'TRTProfilerV1']

models/engine.py
@@ -1,6 +1,6 @@
from pathlib import Path
from typing import Optional, Union, List, Tuple
from collections import namedtuple, defaultdict

try:
    import tensorrt as trt
@@ -40,9 +40,9 @@ class EngineBuilder:
        outputs = [network.get_output(i) for i in range(network.num_outputs)]
        for inp in inputs:
            logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape: {inp.shape} dtype: {inp.dtype}')
        for out in outputs:
            logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape: {out.shape} dtype: {out.dtype}')
        if fp16 and builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        self.weight = self.checkpoint.with_suffix('.engine')
@@ -53,7 +53,7 @@ class EngineBuilder:
        self.weight.write_bytes(engine.serialize())
        logger.log(trt.Logger.WARNING, f'Build TensorRT engine finished.\nSaved to {str(self.weight.absolute())}')

    def build(self, fp16: bool = True, with_profiling=True) -> None:
        self.__build_engine(fp16, with_profiling)
@@ -64,7 +64,7 @@ class TRTModule(torch.nn.Module):
                     trt.float16: torch.float16,
                     trt.float32: torch.float32}

    def __init__(self, weight: Union[str, Path], device: Optional[torch.device]) -> None:
        super(TRTModule, self).__init__()
        self.weight = Path(weight) if isinstance(weight, str) else weight
        self.device = device if device is not None else torch.device('cuda:0')
@@ -72,7 +72,7 @@ class TRTModule(torch.nn.Module):
        self.__init_engine()
        self.__init_bindings()

    def __init_engine(self) -> None:
        logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(logger, namespace='')
        with trt.Runtime(logger) as runtime:
@@ -98,7 +98,7 @@ class TRTModule(torch.nn.Module):
        self.input_names = names[:num_inputs]
        self.output_names = names[num_inputs:]

    def __init_bindings(self) -> None:
        dynamic = False
        Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))
        inp_info = []
@@ -122,7 +122,10 @@ class TRTModule(torch.nn.Module):
        self.inp_info = inp_info
        self.out_info = out_info

    def set_profiler(self, profiler: Optional[trt.IProfiler]):
        # Attach a profiler to the execution context, falling back to
        # TensorRT's built-in trt.Profiler when none is given.
        self.context.profiler = profiler if profiler is not None else trt.Profiler()

    def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:
        assert len(inputs) == self.num_inputs
        contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs]
@@ -148,3 +151,30 @@ class TRTModule(torch.nn.Module):
        self.stream.synchronize()

        return tuple(outputs) if len(outputs) > 1 else outputs[0]


class TRTProfilerV1(trt.IProfiler):
    # Accumulating profiler: sums per-layer times across calls and reports
    # them sorted by total cost.
    def __init__(self):
        trt.IProfiler.__init__(self)
        self.total_runtime = 0.0
        self.recorder = defaultdict(float)

    def report_layer_time(self, layer_name: str, ms: float):
        # TensorRT reports milliseconds; store microseconds.
        self.total_runtime += ms * 1000
        self.recorder[layer_name] += ms * 1000

    def report(self):
        f = '\t%40s\t\t\t\t%10.4f'
        print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))
        for name, cost in sorted(self.recorder.items(), key=lambda x: -x[1]):
            print(f % (name if len(name) < 40 else name[:35] + ' ' + '*' * 4, cost))
        print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')


class TRTProfilerV0(trt.IProfiler):
    # Streaming profiler: prints each layer's time as soon as TensorRT reports it.
    def __init__(self):
        trt.IProfiler.__init__(self)

    def report_layer_time(self, layer_name: str, ms: float):
        f = '\t%40s\t\t\t\t%10.4fms'
        print(f % (layer_name if len(layer_name) < 40 else layer_name[:35] + ' ' + '*' * 4, ms))
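The `profile` helper in infer.py above attaches `TRTProfilerV0`, so each layer's time is printed as it is measured. For totals aggregated across several runs, `TRTProfilerV1` can be attached the same way; a minimal sketch using only names from this commit:

``` python
import torch
from models import TRTModule, TRTProfilerV1

device = torch.device('cuda:0')
engine = TRTModule('yolov8s_nms.engine', device)
profiler = TRTProfilerV1()
engine.set_profiler(profiler)

# Several passes make the accumulated per-layer costs more stable.
dummy = torch.randn(engine.inp_info[0].shape, device=device)
for _ in range(10):
    engine(dummy)
profiler.report()  # per-layer cost in us, sorted by total, plus the sum
```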
