You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
161 lines
6.0 KiB
161 lines
6.0 KiB
2 years ago
|
import os
|
||
|
import warnings
|
||
|
from collections import namedtuple
|
||
|
from pathlib import Path
|
||
|
from typing import List, Optional, Tuple, Union
|
||
|
|
||
|
import numpy as np
|
||
|
import tensorrt as trt
|
||
|
from cuda import cudart
|
||
|
from numpy import ndarray
|
||
|
|
||
|
# Ask the CUDA runtime to load modules lazily (presumably to reduce
# start-up overhead — behavior depends on the installed CUDA version).
os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
# Silence DeprecationWarnings (the TensorRT binding APIs used below
# are deprecated on newer releases and would otherwise be noisy).
warnings.filterwarnings('ignore', category=DeprecationWarning)
|
||
|
|
||
|
|
||
|
class TRTEngine:
    """Minimal inference wrapper around a serialized TensorRT engine.

    Loads an engine file, creates an execution context and a private CUDA
    stream, pre-allocates host/device buffers for engines with fully static
    shapes, and exposes inference through ``__call__``.

    Notes:
        * Uses the (deprecated) binding-index TensorRT API
          (``get_binding_name`` / ``binding_is_input`` / ...); bindings are
          assumed to be laid out inputs-first, then outputs.
        * For dynamic-shape engines, device buffers are allocated on every
          call and are never freed — NOTE(review): this leaks device memory
          across calls; confirm acceptable for the intended usage.
    """

    def __init__(self, weight: Union[str, Path]) -> None:
        """Load the engine from *weight*, build bindings and warm up.

        Args:
            weight: Path (or string path) to a serialized TensorRT engine.
        """
        self.weight = Path(weight) if isinstance(weight, str) else weight
        # Private stream used for all H2D/D2H copies and kernel launches.
        status, self.stream = cudart.cudaStreamCreate()
        assert status.value == 0
        self.__init_engine()
        self.__init_bindings()
        self.__warm_up()

    def __init_engine(self) -> None:
        """Deserialize the engine and create the execution context."""
        logger = trt.Logger(trt.Logger.WARNING)
        # Load any plugin layers the engine may reference.
        trt.init_libnvinfer_plugins(logger, namespace='')
        with trt.Runtime(logger) as runtime:
            model = runtime.deserialize_cuda_engine(self.weight.read_bytes())

        context = model.create_execution_context()

        names = [model.get_binding_name(i) for i in range(model.num_bindings)]
        self.num_bindings = model.num_bindings
        # Device pointers handed to execute_async_v2, filled per call.
        self.bindings: List[int] = [0] * self.num_bindings
        num_inputs, num_outputs = 0, 0

        for i in range(model.num_bindings):
            if model.binding_is_input(i):
                num_inputs += 1
            else:
                num_outputs += 1

        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.model = model
        self.context = context
        # Bindings are inputs-first, then outputs.
        self.input_names = names[:num_inputs]
        self.output_names = names[num_inputs:]

    def __init_bindings(self) -> None:
        """Record dtype/shape per binding and pre-allocate static buffers.

        For static-shape engines a pinned-less host array and a device
        buffer are allocated per binding; for dynamic-shape engines
        placeholders are stored and real allocation is deferred to
        ``__call__``.
        """
        dynamic = False
        Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape', 'cpu', 'gpu'))
        inp_info = []
        out_info = []
        out_ptrs = []

        for i, name in enumerate(self.input_names):
            assert self.model.get_binding_name(i) == name
            dtype = trt.nptype(self.model.get_binding_dtype(i))
            shape = tuple(self.model.get_binding_shape(i))
            if -1 in shape:  # -1 marks a dynamic axis
                dynamic |= True
            if not dynamic:
                cpu = np.empty(shape, dtype)
                status, gpu = cudart.cudaMallocAsync(cpu.nbytes, self.stream)
                assert status.value == 0
                cudart.cudaMemcpyAsync(
                    gpu, cpu.ctypes.data, cpu.nbytes,
                    cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
            else:
                # Placeholders; allocated lazily in __call__.
                cpu, gpu = np.empty(0), 0
            inp_info.append(Tensor(name, dtype, shape, cpu, gpu))

        for i, name in enumerate(self.output_names):
            i += self.num_inputs  # outputs follow inputs in binding order
            assert self.model.get_binding_name(i) == name
            dtype = trt.nptype(self.model.get_binding_dtype(i))
            shape = tuple(self.model.get_binding_shape(i))
            if not dynamic:
                cpu = np.empty(shape, dtype=dtype)
                status, gpu = cudart.cudaMallocAsync(cpu.nbytes, self.stream)
                assert status.value == 0
                cudart.cudaMemcpyAsync(
                    gpu, cpu.ctypes.data, cpu.nbytes,
                    cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
                out_ptrs.append(gpu)
            else:
                cpu, gpu = np.empty(0), 0
            out_info.append(Tensor(name, dtype, shape, cpu, gpu))

        self.is_dynamic = dynamic
        self.inp_info = inp_info
        self.out_info = out_info
        self.out_ptrs = out_ptrs

    def __warm_up(self) -> None:
        """Run a few dummy inferences so the first real call is not slow."""
        if self.is_dynamic:
            print('Your engine has dynamic axes, please warm up by yourself !')
            return
        for _ in range(10):
            inputs = []
            for i in self.inp_info:
                inputs.append(i.cpu)
            # BUGFIX: was `self.__call__(inputs)`, which passed the whole
            # list as a single argument and tripped the input-count assert
            # for any engine with more than one input. Unpack instead.
            self.__call__(*inputs)

    def set_profiler(self, profiler: Optional[trt.IProfiler]) -> None:
        """Attach a profiler to the context (default ``trt.Profiler`` if None)."""
        self.context.profiler = profiler \
            if profiler is not None else trt.Profiler()

    def __call__(self, *inputs) -> Union[Tuple, ndarray]:
        """Run inference.

        Args:
            *inputs: One ndarray (or array-like) per engine input, in
                binding order.

        Returns:
            A single ndarray if the engine has one output, otherwise a
            tuple of ndarrays in binding order.
        """
        assert len(inputs) == self.num_inputs
        contiguous_inputs: List[ndarray] = [
            np.ascontiguousarray(i) for i in inputs
        ]

        for i in range(self.num_inputs):

            if self.is_dynamic:
                self.context.set_binding_shape(
                    i, tuple(contiguous_inputs[i].shape))
                # BUGFIX: the original assigned to `self.inp_info[i].gpu`,
                # but Tensor is a namedtuple (immutable), so that raised
                # AttributeError on every dynamic-shape call. Allocate into
                # a local and store an updated copy via _replace().
                # NOTE(review): the previous pointer is never freed — leak.
                status, gpu = cudart.cudaMallocAsync(
                    contiguous_inputs[i].nbytes, self.stream)
                assert status.value == 0
                self.inp_info[i] = self.inp_info[i]._replace(gpu=gpu)
            cudart.cudaMemcpyAsync(
                self.inp_info[i].gpu, contiguous_inputs[i].ctypes.data,
                contiguous_inputs[i].nbytes,
                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
            self.bindings[i] = self.inp_info[i].gpu

        output_gpu_ptrs: List[int] = []
        outputs: List[ndarray] = []

        for i in range(self.num_outputs):
            j = i + self.num_inputs
            if self.is_dynamic:
                # Output shape is only known after input shapes are set.
                shape = tuple(self.context.get_binding_shape(j))
                dtype = self.out_info[i].dtype
                cpu = np.empty(shape, dtype=dtype)
                status, gpu = cudart.cudaMallocAsync(cpu.nbytes, self.stream)
                assert status.value == 0
                cudart.cudaMemcpyAsync(
                    gpu, cpu.ctypes.data, cpu.nbytes,
                    cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
            else:
                cpu = self.out_info[i].cpu
                gpu = self.out_info[i].gpu
            outputs.append(cpu)
            output_gpu_ptrs.append(gpu)
            self.bindings[j] = gpu

        self.context.execute_async_v2(self.bindings, self.stream)
        cudart.cudaStreamSynchronize(self.stream)

        # Copy results back to the host arrays returned to the caller.
        for i, o in enumerate(output_gpu_ptrs):
            cudart.cudaMemcpyAsync(
                outputs[i].ctypes.data, o, outputs[i].nbytes,
                cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, self.stream)

        return tuple(outputs) if len(outputs) > 1 else outputs[0]
|