Add profiler

pull/1/head
triple-Mu 2 years ago
parent 7c70bf6864
commit 9305126e14
  1. 35
      README.md
  2. 18
      infer.py
  3. 4
      models/__init__.py
  4. 48
      models/engine.py

@ -1,18 +1,25 @@
# YOLOv8-TensorRT
YOLOv8 accelerated with TensorRT!
# Preprocessed ONNX model
You can download the ONNX models, which were pretrained by https://github.com/ultralytics .
[**YOLOv8-n**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)
[**YOLOv8-n
**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8n_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1772936700&Signature=r6HgJTTcCSAxQxD9bKO9qBTtigQ%3D)
[**YOLOv8-s**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)
[**YOLOv8-s
**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8s_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936722&Signature=JjxQFx1YElcVdsCaMoj81KJ4a5s%3D)
[**YOLOv8-m**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)
[**YOLOv8-m
**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8m_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936739&Signature=IRKBELdVFemD7diixxxgzMYqsWg%3D)
[**YOLOv8-l**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)
[**YOLOv8-l
**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8l_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1682936763&Signature=RGkJ4G2XJ4J%2BNiki5cJi3oBkDnA%3D)
[**YOLOv8-x**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)
[**YOLOv8-x
**](https://triplemu.oss-cn-beijing.aliyuncs.com/YOLOv8/ONNX/yolov8x_nms.onnx?OSSAccessKeyId=LTAI5tN1dgmZD4PF8AJUXp3J&Expires=1673936778&Signature=3o%2F7QKhiZg1dW3I6sDrY4ug6MQU%3D)
# Build TensorRT engine by ONNX
@ -20,7 +27,7 @@ You can dowload the onnx model which is pretrained by https://github.com/ultraly
You can export TensorRT engine by [`build.py` ](build.py).
Usage:
Usage:
``` shell
python3 build.py --onnx yolov8s_nms.onnx --device cuda:0 --fp16
@ -42,9 +49,11 @@ Usage:
/usr/src/tensorrt/bin/trtexec --onnx=yolov8s_nms.onnx --saveEngine=yolov8s_nms.engine --fp16
```
***If you installed TensorRT by a debian package, then the installation path of `trtexec` is `/usr/src/tensorrt/bin/trtexec`***
***If you installed TensorRT by a debian package, then the installation path of `trtexec`
is `/usr/src/tensorrt/bin/trtexec`***
***If you installed TensorRT by a tar package, then the installation path of trtexec is under the `bin` folder in the path you decompressed***
***If you installed TensorRT by a tar package, then the installation path of `trtexec` is under the `bin` folder in the
path you decompressed***
# Infer images by the engine which you export
@ -68,5 +77,13 @@ python3 infer.py --engine yolov8s_nms.engine --imgs data --show --out-dir output
- `--device` : The CUDA device you use.
- `--profile` : Profile the TensorRT engine.
If you want to profile the TensorRT engine:
Usage:
``` shell
python3 infer.py --engine yolov8s_nms.engine --profile
```

@ -1,4 +1,4 @@
from models import TRTModule
from models import TRTModule, TRTProfilerV0
from pathlib import Path
import cv2
import argparse
@ -112,10 +112,24 @@ def parse_args():
'--out-dir', type=str, default='./output', help='Path to output file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='TensorRT infer device')
parser.add_argument(
'--profile', action='store_true', help='Profile TensorRT engine')
args = parser.parse_args()
return args
def profile(args):
    """Run a single inference on random input with a profiler attached.

    Builds a ``TRTModule`` from ``args.engine`` on ``args.device``, installs
    a ``TRTProfilerV0`` on it and performs one forward pass. The result of
    the forward pass is discarded.
    """
    target = torch.device(args.device)
    engine = TRTModule(args.engine, target)
    engine.set_profiler(TRTProfilerV0())
    # Random values shaped like the first input; the output is not used.
    dummy = torch.randn(engine.inp_info[0].shape, device=target)
    engine(dummy)
if __name__ == '__main__':
args = parse_args()
main(args)
if args.profile:
profile(args)
else:
main(args)

@ -1,3 +1,3 @@
from .engine import EngineBuilder, TRTModule
from .engine import EngineBuilder, TRTModule, TRTProfilerV0, TRTProfilerV1
__all__ = ['EngineBuilder', 'TRTModule']
__all__ = ['EngineBuilder', 'TRTModule', 'TRTProfilerV0', 'TRTProfilerV1']

@ -1,6 +1,6 @@
from pathlib import Path
from typing import Optional, Union, List
from collections import namedtuple
from typing import Optional, Union, List, Tuple
from collections import namedtuple, defaultdict
try:
import tensorrt as trt
@ -40,9 +40,9 @@ class EngineBuilder:
outputs = [network.get_output(i) for i in range(network.num_outputs)]
for inp in inputs:
logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape{inp.shape} {inp.dtype}')
logger.log(trt.Logger.WARNING, f'input "{inp.name}" with shape: {inp.shape} dtype: {inp.dtype}')
for out in outputs:
logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape{out.shape} {out.dtype}')
logger.log(trt.Logger.WARNING, f'output "{out.name}" with shape: {out.shape} dtype: {out.dtype}')
if fp16 and builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
self.weight = self.checkpoint.with_suffix('.engine')
@ -53,7 +53,7 @@ class EngineBuilder:
self.weight.write_bytes(engine.serialize())
logger.log(trt.Logger.WARNING, f'Build tensorrt engine finish.\nSave in {str(self.weight.absolute())}')
def build(self, fp16: bool = True, with_profiling=True):
def build(self, fp16: bool = True, with_profiling=True) -> None:
self.__build_engine(fp16, with_profiling)
@ -64,7 +64,7 @@ class TRTModule(torch.nn.Module):
trt.float16: torch.float16,
trt.float32: torch.float32}
def __init__(self, weight: Union[str, Path], device: Optional[torch.device]):
def __init__(self, weight: Union[str, Path], device: Optional[torch.device]) -> None:
super(TRTModule, self).__init__()
self.weight = Path(weight) if isinstance(weight, str) else weight
self.device = device if device is not None else torch.device('cuda:0')
@ -72,7 +72,7 @@ class TRTModule(torch.nn.Module):
self.__init_engine()
self.__init_bindings()
def __init_engine(self):
def __init_engine(self) -> None:
logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, namespace='')
with trt.Runtime(logger) as runtime:
@ -98,7 +98,7 @@ class TRTModule(torch.nn.Module):
self.input_names = names[:num_inputs]
self.output_names = names[num_inputs:]
def __init_bindings(self):
def __init_bindings(self) -> None:
dynamic = False
Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))
inp_info = []
@ -122,7 +122,10 @@ class TRTModule(torch.nn.Module):
self.inp_info = inp_info
self.out_infp = out_info
def forward(self, *inputs):
def set_profiler(self, profiler: Optional[trt.IProfiler] = None) -> None:
    """Attach a profiler to this module's execution context.

    Args:
        profiler: Profiler to install. When ``None`` (the default), a
            ``trt.Profiler()`` instance is installed instead.
    """
    if profiler is None:
        profiler = trt.Profiler()
    self.context.profiler = profiler
def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:
assert len(inputs) == self.num_inputs
contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs]
@ -148,3 +151,30 @@ class TRTModule(torch.nn.Module):
self.stream.synchronize()
return tuple(outputs) if len(outputs) > 1 else outputs[0]
class TRTProfilerV1(trt.IProfiler):
    """Profiler that accumulates layer timings and prints them on request.

    Timings are reported in milliseconds and stored in microseconds, both
    per layer name (``recorder``) and as a running sum (``total_runtime``).
    """

    def __init__(self):
        trt.IProfiler.__init__(self)
        self.total_runtime = 0.0
        self.recorder = defaultdict(float)

    def report_layer_time(self, layer_name: str, ms: float):
        """Add one layer timing, given in milliseconds, to the totals."""
        elapsed_us = ms * 1000
        self.total_runtime += elapsed_us
        self.recorder[layer_name] += elapsed_us

    def report(self):
        """Print the recorded layers, highest cost first, then the total."""
        row = '\t%40s\t\t\t\t%10.4f'
        print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))
        ranked = sorted(
            self.recorder.items(), key=lambda item: item[1], reverse=True)
        for name, cost in ranked:
            # Names of 40 or more characters are cut to 35 and marked ' ****'.
            if len(name) >= 40:
                name = name[:35] + ' ' + '*' * 4
            print(row % (name, cost))
        print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')
class TRTProfilerV0(trt.IProfiler):
    """Profiler that prints each layer timing as soon as it is reported."""

    def __init__(self):
        trt.IProfiler.__init__(self)

    def report_layer_time(self, layer_name: str, ms: float):
        """Print one layer name and its time in milliseconds.

        Names of 40 or more characters are cut to 35 and marked ' ****'.
        """
        row = '\t%40s\t\t\t\t%10.4fms'
        if len(layer_name) < 40:
            shown = layer_name
        else:
            shown = layer_name[:35] + ' ' + '*' * 4
        print(row % (shown, ms))

Loading…
Cancel
Save