YOLOv8-TensorRT/models/engine.py

import os
import pickle
from collections import defaultdict, namedtuple
from pathlib import Path
from typing import List, Optional, Tuple, Union

import onnx
import tensorrt as trt
import torch

os.environ['CUDA_MODULE_LOADING'] = 'LAZY'


class EngineBuilder:
    seg = False

    def __init__(
            self,
            checkpoint: Union[str, Path],
            device: Optional[Union[str, int, torch.device]] = None) -> None:
        checkpoint = Path(checkpoint) if isinstance(checkpoint,
                                                    str) else checkpoint
        assert checkpoint.exists() and checkpoint.suffix in ('.onnx', '.pkl')
        self.api = checkpoint.suffix == '.pkl'
        if isinstance(device, str):
            device = torch.device(device)
        elif isinstance(device, int):
            device = torch.device(f'cuda:{device}')

        self.checkpoint = checkpoint
        self.device = device

    def __build_engine(self,
                       fp16: bool = True,
                       input_shape: Union[List, Tuple] = (1, 3, 640, 640),
                       iou_thres: float = 0.65,
                       conf_thres: float = 0.25,
                       topk: int = 100,
                       with_profiling: bool = True) -> None:
        """Create the TensorRT network, build it and save the engine file.

        The network is populated by ``build_from_api`` (``.pkl`` checkpoint)
        or ``build_from_onnx`` (``.onnx`` checkpoint).  The serialized engine
        is written next to the checkpoint with the suffix ``.engine`` and its
        path is kept in ``self.weight``.

        Args:
            fp16: Request FP16 kernels; only honoured when the builder
                reports ``platform_has_fast_fp16``.
            input_shape: Input dimensions, used by the API build only.
            iou_thres: Forwarded to the selected network builder.
            conf_thres: Forwarded to the selected network builder.
            topk: Forwarded to the selected network builder.
            with_profiling: Set ``ProfilingVerbosity.DETAILED`` on the
                builder config.

        Raises:
            RuntimeError: TensorRT could not build the engine.
        """
        logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(logger, namespace='')
        builder = trt.Builder(logger)
        config = builder.create_builder_config()

        # Allow the builder to use up to the whole device memory as scratch.
        workspace = torch.cuda.get_device_properties(
            self.device).total_memory
        if hasattr(config, 'set_memory_pool_limit'):
            # ``max_workspace_size`` is deprecated and absent from newer
            # TensorRT releases; prefer the memory-pool API when it exists.
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE,
                                         workspace)
        else:
            config.max_workspace_size = workspace

        flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        network = builder.create_network(flag)

        # The build_from_* helpers read these attributes.
        self.logger = logger
        self.builder = builder
        self.network = network
        if self.api:
            self.build_from_api(fp16, input_shape, iou_thres, conf_thres, topk)
        else:
            self.build_from_onnx(iou_thres, conf_thres, topk)
        if fp16 and self.builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        self.weight = self.checkpoint.with_suffix('.engine')

        if with_profiling:
            config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED

        if hasattr(self.builder, 'build_serialized_network'):
            # Newer API: returns the serialized plan directly, or None on
            # failure.
            serialized = self.builder.build_serialized_network(
                self.network, config)
            if serialized is None:
                raise RuntimeError(
                    f'failed to build TensorRT engine from: '
                    f'{str(self.checkpoint)}')
            self.weight.write_bytes(serialized)
        else:
            engine = self.builder.build_engine(self.network, config)
            if engine is None:
                # Without this check ``with None`` fails with an opaque
                # AttributeError.
                raise RuntimeError(
                    f'failed to build TensorRT engine from: '
                    f'{str(self.checkpoint)}')
            with engine:
                self.weight.write_bytes(engine.serialize())
        self.logger.log(
            trt.Logger.WARNING, f'Build tensorrt engine finish.\n'
            f'Save in {str(self.weight.absolute())}')

    def build(self,
              fp16: bool = True,
              input_shape: Union[List, Tuple] = (1, 3, 640, 640),
              iou_thres: float = 0.65,
              conf_thres: float = 0.25,
              topk: int = 100,
              with_profiling=True) -> None:
        """Public entry point; hands every option to the private builder.

        Args:
            fp16: Request FP16 precision.
            input_shape: Input dimensions for the API build path.
            iou_thres: IoU threshold passed on to the network builder.
            conf_thres: Confidence threshold passed on to the network builder.
            topk: Value passed on to the network builder as ``topk``.
            with_profiling: Enable detailed profiling verbosity.
        """
        self.__build_engine(
            fp16=fp16,
            input_shape=input_shape,
            iou_thres=iou_thres,
            conf_thres=conf_thres,
            topk=topk,
            with_profiling=with_profiling,
        )

    def build_from_onnx(self,
                        iou_thres: float = 0.65,
                        conf_thres: float = 0.25,
                        topk: int = 100):
        """Fill ``self.network`` by parsing the ONNX checkpoint.

        Unless the class flag ``seg`` is set, attributes 2, 3 and 4 of the
        last graph node are overwritten with ``topk``, ``conf_thres`` and
        ``iou_thres`` before parsing.
        NOTE(review): this presumes the last node is the NMS node and that
        its attributes come in this order - confirm against the exporter.

        Raises:
            RuntimeError: The TensorRT ONNX parser rejected the model.
        """
        parser = trt.OnnxParser(self.network, self.logger)
        onnx_model = onnx.load(str(self.checkpoint))
        if not self.seg:
            tail_attrs = onnx_model.graph.node[-1].attribute
            tail_attrs[2].i = topk
            tail_attrs[3].f = conf_thres
            tail_attrs[4].f = iou_thres

        parsed_ok = parser.parse(onnx_model.SerializeToString())
        if not parsed_ok:
            raise RuntimeError(
                f'failed to load ONNX file: {str(self.checkpoint)}')

        net = self.network
        inputs = [net.get_input(k) for k in range(net.num_inputs)]
        outputs = [net.get_output(k) for k in range(net.num_outputs)]

        # Report every network binding through the TensorRT logger.
        log = self.logger.log
        for tensor in inputs:
            log(trt.Logger.WARNING,
                f'input "{tensor.name}" with shape: {tensor.shape} '
                f'dtype: {tensor.dtype}')
        for tensor in outputs:
            log(trt.Logger.WARNING,
                f'output "{tensor.name}" with shape: {tensor.shape} '
                f'dtype: {tensor.dtype}')

    def build_from_api(
        self,
        fp16: bool = True,
        input_shape: Union[List, Tuple] = (1, 3, 640, 640),
        iou_thres: float = 0.65,
        conf_thres: float = 0.25,
        topk: int = 100,
    ):
        """Populate ``self.network`` layer by layer from a pickled state dict.

        The layers are created with the ``Conv``, ``C2f``, ``SPPF`` and
        ``Detect`` helpers imported from ``.api``; every output of the final
        ``Detect`` layer is marked as a network output.

        Args:
            fp16: Forwarded to ``Detect``.
            input_shape: Four dimensions of the ``images`` input tensor.
            iou_thres: Forwarded to ``Detect``.
            conf_thres: Forwarded to ``Detect``.
            topk: Forwarded to ``Detect``.

        Raises:
            AssertionError: ``seg`` is set, or adding the input / a resize
                layer returned a falsy value.
            KeyError: The state dict lacks a required key, or its ``GW``
                value is not a key of ``mapping``.
        """
        # The API build path is not available when the ``seg`` flag is set.
        assert not self.seg
        from .api import SPPF, C2f, Conv, Detect, get_depth, get_width

        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file - only load checkpoints from a trusted source.
        with open(self.checkpoint, 'rb') as f:
            state_dict = pickle.load(f)
        # Base channel count of the deepest stage, selected by GW before it
        # goes through get_width.  Float keys: a GW outside this table
        # raises KeyError.
        mapping = {0.25: 1024, 0.5: 1024, 0.75: 768, 1.0: 512, 1.25: 512}

        # presumably the width / depth multipliers of the model variant -
        # verify against the script that writes the .pkl file
        GW = state_dict['GW']
        GD = state_dict['GD']
        width_64 = get_width(64, GW)
        width_128 = get_width(128, GW)
        width_256 = get_width(256, GW)
        width_512 = get_width(512, GW)
        width_1024 = get_width(mapping[GW], GW)
        depth_3 = get_depth(3, GD)
        depth_6 = get_depth(6, GD)
        strides = state_dict['strides']
        reg_max = state_dict['reg_max']
        images = self.network.add_input(name='images',
                                        dtype=trt.float32,
                                        shape=trt.Dims4(input_shape))
        assert images, 'Add input failed'

        # Layers 0-9: a chain of Conv / C2f layers ending in SPPF.  The
        # trailing string of each call is the layer's name; the number in it
        # matches the variable suffix.
        Conv_0 = Conv(self.network, state_dict, images, width_64, 3, 2, 1,
                      'Conv.0')
        Conv_1 = Conv(self.network, state_dict, Conv_0.get_output(0),
                      width_128, 3, 2, 1, 'Conv.1')
        C2f_2 = C2f(self.network, state_dict, Conv_1.get_output(0), width_128,
                    depth_3, True, 1, 0.5, 'C2f.2')
        Conv_3 = Conv(self.network, state_dict, C2f_2.get_output(0), width_256,
                      3, 2, 1, 'Conv.3')
        C2f_4 = C2f(self.network, state_dict, Conv_3.get_output(0), width_256,
                    depth_6, True, 1, 0.5, 'C2f.4')
        Conv_5 = Conv(self.network, state_dict, C2f_4.get_output(0), width_512,
                      3, 2, 1, 'Conv.5')
        C2f_6 = C2f(self.network, state_dict, Conv_5.get_output(0), width_512,
                    depth_6, True, 1, 0.5, 'C2f.6')
        Conv_7 = Conv(self.network, state_dict, C2f_6.get_output(0),
                      width_1024, 3, 2, 1, 'Conv.7')
        C2f_8 = C2f(self.network, state_dict, Conv_7.get_output(0), width_1024,
                    depth_3, True, 1, 0.5, 'C2f.8')
        SPPF_9 = SPPF(self.network, state_dict, C2f_8.get_output(0),
                      width_1024, width_1024, 5, 'SPPF.9')
        # Layers 10-12: nearest-neighbour resize of the SPPF output.  The
        # target shape keeps the first two dims of the resize output and
        # takes the remaining dims from C2f_6, then both are concatenated.
        Upsample_10 = self.network.add_resize(SPPF_9.get_output(0))
        assert Upsample_10, 'Add Upsample_10 failed'
        Upsample_10.resize_mode = trt.ResizeMode.NEAREST
        Upsample_10.shape = Upsample_10.get_output(
            0).shape[:2] + C2f_6.get_output(0).shape[2:]
        input_tensors11 = [Upsample_10.get_output(0), C2f_6.get_output(0)]
        Cat_11 = self.network.add_concatenation(input_tensors11)
        C2f_12 = C2f(self.network, state_dict, Cat_11.get_output(0), width_512,
                     depth_3, False, 1, 0.5, 'C2f.12')
        # Layers 13-15: same resize + concatenate pattern, this time sized
        # after and joined with C2f_4.
        Upsample13 = self.network.add_resize(C2f_12.get_output(0))
        assert Upsample13, 'Add Upsample13 failed'
        Upsample13.resize_mode = trt.ResizeMode.NEAREST
        Upsample13.shape = Upsample13.get_output(
            0).shape[:2] + C2f_4.get_output(0).shape[2:]
        input_tensors14 = [Upsample13.get_output(0), C2f_4.get_output(0)]
        Cat_14 = self.network.add_concatenation(input_tensors14)
        C2f_15 = C2f(self.network, state_dict, Cat_14.get_output(0), width_256,
                     depth_3, False, 1, 0.5, 'C2f.15')
        # Layers 16-18: Conv on C2f_15, concatenated with C2f_12.
        Conv_16 = Conv(self.network, state_dict, C2f_15.get_output(0),
                       width_256, 3, 2, 1, 'Conv.16')
        input_tensors17 = [Conv_16.get_output(0), C2f_12.get_output(0)]
        Cat_17 = self.network.add_concatenation(input_tensors17)
        C2f_18 = C2f(self.network, state_dict, Cat_17.get_output(0), width_512,
                     depth_3, False, 1, 0.5, 'C2f.18')
        # Layers 19-21: Conv on C2f_18, concatenated with the SPPF output.
        Conv_19 = Conv(self.network, state_dict, C2f_18.get_output(0),
                       width_512, 3, 2, 1, 'Conv.19')
        input_tensors20 = [Conv_19.get_output(0), SPPF_9.get_output(0)]
        Cat_20 = self.network.add_concatenation(input_tensors20)
        C2f_21 = C2f(self.network, state_dict, Cat_20.get_output(0),
                     width_1024, depth_3, False, 1, 0.5, 'C2f.21')
        # Layer 22: Detect consumes the three C2f outputs together with the
        # thresholds; each of its outputs becomes a network output.
        input_tensors22 = [
            C2f_15.get_output(0),
            C2f_18.get_output(0),
            C2f_21.get_output(0)
        ]
        batched_nms = Detect(self.network, state_dict, input_tensors22,
                             strides, 'Detect.22', reg_max, fp16, iou_thres,
                             conf_thres, topk)
        for o in range(batched_nms.num_outputs):
            self.network.mark_output(batched_nms.get_output(o))


class TRTModule(torch.nn.Module):
    dtypeMapping = {
        trt.bool: torch.bool,
        trt.int8: torch.int8,
        trt.int32: torch.int32,
        trt.float16: torch.float16,
        trt.float32: torch.float32
    }

    def __init__(self, weight: Union[str, Path],
                 device: Optional[torch.device] = None) -> None:
        """Load a serialized engine and prepare its bindings.

        Args:
            weight: Path of the serialized TensorRT engine file.
            device: Device on which the CUDA stream and the output buffers
                are created; ``None`` selects ``cuda:0``.
        """
        super(TRTModule, self).__init__()
        self.weight = Path(weight) if isinstance(weight, str) else weight
        self.device = device if device is not None else torch.device('cuda:0')
        # Create the stream on the resolved device.  Passing the raw argument
        # would bind the stream to the *current* device when ``device`` is
        # None, which may differ from the ``cuda:0`` fallback used for the
        # output buffers.
        self.stream = torch.cuda.Stream(device=self.device)
        self.__init_engine()
        self.__init_bindings()

    def __init_engine(self) -> None:
        """Deserialize the engine file and collect binding metadata.

        Sets ``model``, ``context``, ``bindings``, ``num_bindings``,
        ``num_inputs``, ``num_outputs``, ``input_names``, ``output_names``
        and ``idx`` (identity ordering of the outputs).

        Raises:
            RuntimeError: The engine could not be deserialized or no
                execution context could be created.
        """
        logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(logger, namespace='')
        with trt.Runtime(logger) as runtime:
            model = runtime.deserialize_cuda_engine(self.weight.read_bytes())
        if model is None:
            # A failed deserialization yields None; fail here with a clear
            # message rather than with an AttributeError on the next line.
            raise RuntimeError(
                f'failed to deserialize TensorRT engine: {self.weight}')

        context = model.create_execution_context()
        if context is None:
            raise RuntimeError(
                f'failed to create execution context for: {self.weight}')

        num_bindings = model.num_bindings
        names = [model.get_binding_name(i) for i in range(num_bindings)]
        num_inputs = sum(1 for i in range(num_bindings)
                         if model.binding_is_input(i))
        num_outputs = num_bindings - num_inputs

        # One device-pointer slot per binding, filled in by forward().
        self.bindings: List[int] = [0] * num_bindings
        self.num_bindings = num_bindings
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.model = model
        self.context = context
        # The name split relies on all input bindings preceding the outputs;
        # __init_bindings asserts the names match index by index.
        self.input_names = names[:num_inputs]
        self.output_names = names[num_inputs:]
        self.idx = list(range(self.num_outputs))

    def __init_bindings(self) -> None:
        """Describe every binding and pre-allocate static output buffers.

        Stores ``inp_info`` / ``out_info`` (name, torch dtype, shape) and
        the flags ``idynamic`` / ``odynamic``, which are true when any input
        / output shape contains ``-1``.  ``output_tensor`` is created only
        when no output shape is dynamic.
        """
        Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))
        engine = self.model

        def describe(index, expected):
            # The binding at this index must carry the expected name.
            assert engine.get_binding_name(index) == expected
            torch_dtype = self.dtypeMapping[engine.get_binding_dtype(index)]
            dims = tuple(engine.get_binding_shape(index))
            return Tensor(expected, torch_dtype, dims)

        inp_info = [
            describe(pos, name) for pos, name in enumerate(self.input_names)
        ]
        # Output bindings are indexed after the inputs.
        out_info = [
            describe(pos, name)
            for pos, name in enumerate(self.output_names, self.num_inputs)
        ]
        idynamic = any(-1 in info.shape for info in inp_info)
        odynamic = any(-1 in info.shape for info in out_info)

        if not odynamic:
            self.output_tensor = [
                torch.empty(info.shape, dtype=info.dtype, device=self.device)
                for info in out_info
            ]
        self.inp_info = inp_info
        self.out_info = out_info
        self.idynamic = idynamic
        self.odynamic = odynamic

    def set_profiler(self, profiler: Optional[trt.IProfiler]):
        """Attach a profiler to the execution context.

        Args:
            profiler: Profiler to install; ``None`` installs a fresh
                ``trt.Profiler()``.
        """
        if profiler is None:
            profiler = trt.Profiler()
        self.context.profiler = profiler

    def set_desired(self, desired: Optional[Union[List, Tuple]]):
        """Choose the order in which ``forward`` returns its outputs.

        Args:
            desired: Output names in the wanted order.  The call does nothing
                unless this is a list or tuple with exactly ``num_outputs``
                entries.

        Raises:
            ValueError: An entry is not one of ``output_names``.
        """
        if not isinstance(desired, (list, tuple)):
            return
        if len(desired) != self.num_outputs:
            return
        position_of = self.output_names.index
        self.idx = [position_of(name) for name in desired]

    def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:
        """Run the engine on the given tensors.

        Args:
            *inputs: One tensor per input binding, in binding order.

        Returns:
            The single output tensor when the engine has one output,
            otherwise a tuple of outputs ordered by ``self.idx``.  With
            static output shapes the pre-allocated ``output_tensor`` buffers
            are returned, so a later call overwrites them.

        Raises:
            AssertionError: The number of inputs differs from ``num_inputs``.
        """
        assert len(inputs) == self.num_inputs
        contiguous_inputs: List[torch.Tensor] = [
            i.contiguous() for i in inputs
        ]

        for i, tensor in enumerate(contiguous_inputs):
            self.bindings[i] = tensor.data_ptr()
            if self.idynamic:
                self.context.set_binding_shape(i, tuple(tensor.shape))

        outputs: List[torch.Tensor] = []

        for i in range(self.num_outputs):
            j = i + self.num_inputs
            if self.odynamic:
                # Output shapes are known only after the input shapes are set.
                shape = tuple(self.context.get_binding_shape(j))
                output = torch.empty(size=shape,
                                     dtype=self.out_info[i].dtype,
                                     device=self.device)
            else:
                output = self.output_tensor[i]
            self.bindings[j] = output.data_ptr()
            outputs.append(output)

        # The inputs were produced, and dynamic outputs allocated, on the
        # caller's current stream.  Make the private stream wait for that
        # work so the engine never reads tensors that are still being
        # written.
        self.stream.wait_stream(torch.cuda.current_stream(self.device))
        self.context.execute_async_v2(self.bindings, self.stream.cuda_stream)
        # Block the host until inference is done so results are safe to use.
        self.stream.synchronize()

        return tuple(outputs[i]
                     for i in self.idx) if len(outputs) > 1 else outputs[0]


class TRTProfilerV1(trt.IProfiler):
    """Profiler that accumulates per-layer times and prints a summary."""

    def __init__(self):
        trt.IProfiler.__init__(self)
        # Both accumulators hold the reported ``ms`` values times 1000.
        self.total_runtime = 0.0
        self.recorder = defaultdict(float)

    def report_layer_time(self, layer_name: str, ms: float):
        """Add one layer timing (``ms`` scaled by 1000) to the totals."""
        elapsed = ms * 1000
        self.total_runtime += elapsed
        self.recorder[layer_name] += elapsed

    def report(self):
        """Print every recorded layer, most expensive first, then the total."""
        f = '\t%40s\t\t\t\t%10.4f'
        print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))
        ranked = sorted(self.recorder.items(),
                        key=lambda entry: entry[1],
                        reverse=True)
        for name, cost in ranked:
            if len(name) < 40:
                label = name
            else:
                # Shorten long names to 40 characters and mark the cut.
                label = name[:35] + ' ' + '*' * 4
            print(f % (label, cost))
        print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')


class TRTProfilerV0(trt.IProfiler):
    """Profiler that prints each layer timing as soon as it is reported."""

    def __init__(self):
        trt.IProfiler.__init__(self)

    def report_layer_time(self, layer_name: str, ms: float):
        """Print one line with the layer name and its time in ``ms``."""
        f = '\t%40s\t\t\t\t%10.4fms'
        if len(layer_name) < 40:
            label = layer_name
        else:
            # Shorten long names to 40 characters and mark the cut.
            label = layer_name[:35] + ' ' + '*' * 4
        print(f % (label, ms))
Refactor code for detection and segment 2 years ago			`import os`
Support TensorRT api build 2 years ago			`import pickle`
			`from collections import defaultdict, namedtuple`
add converter 2 years ago			`from pathlib import Path`
Support TensorRT api build 2 years ago			`from typing import List, Optional, Tuple, Union`
add converter 2 years ago
Support TensorRT api build 2 years ago			`import onnx`
			`import tensorrt as trt`
add converter 2 years ago			`import torch`

Refactor code for detection and segment 2 years ago			`os.environ['CUDA_MODULE_LOADING'] = 'LAZY'`

add converter 2 years ago
			`class EngineBuilder:`
Add seg README 2 years ago			`seg = False`
add converter 2 years ago
Support TensorRT api build 2 years ago			`def __init__(`
			`self,`
			`checkpoint: Union[str, Path],`
			`device: Optional[Union[str, int, torch.device]] = None) -> None:`
			`checkpoint = Path(checkpoint) if isinstance(checkpoint,`
			`str) else checkpoint`
			`assert checkpoint.exists() and checkpoint.suffix in ('.onnx', '.pkl')`
			`self.api = checkpoint.suffix == '.pkl'`
add converter 2 years ago			`if isinstance(device, str):`
			`device = torch.device(device)`
			`elif isinstance(device, int):`
			`device = torch.device(f'cuda:{device}')`

			`self.checkpoint = checkpoint`
			`self.device = device`

Support TensorRT api build 2 years ago			`def __build_engine(self,`
			`fp16: bool = True,`
			`input_shape: Union[List, Tuple] = (1, 3, 640, 640),`
			`iou_thres: float = 0.65,`
			`conf_thres: float = 0.25,`
			`topk: int = 100,`
			`with_profiling: bool = True) -> None:`
add converter 2 years ago			`logger = trt.Logger(trt.Logger.WARNING)`
			`trt.init_libnvinfer_plugins(logger, namespace='')`
			`builder = trt.Builder(logger)`
			`config = builder.create_builder_config()`
Support TensorRT api build 2 years ago			`config.max_workspace_size = torch.cuda.get_device_properties(`
			`self.device).total_memory`
add converter 2 years ago			`flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))`
			`network = builder.create_network(flag)`

Support TensorRT api build 2 years ago			`self.logger = logger`
			`self.builder = builder`
			`self.network = network`
			`if self.api:`
			`self.build_from_api(fp16, input_shape, iou_thres, conf_thres, topk)`
			`else:`
			`self.build_from_onnx(iou_thres, conf_thres, topk)`
			`if fp16 and self.builder.platform_has_fast_fp16:`
add converter 2 years ago			`config.set_flag(trt.BuilderFlag.FP16)`
			`self.weight = self.checkpoint.with_suffix('.engine')`

			`if with_profiling:`
			`config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED`
Support TensorRT api build 2 years ago			`with self.builder.build_engine(self.network, config) as engine:`
add converter 2 years ago			`self.weight.write_bytes(engine.serialize())`
Support TensorRT api build 2 years ago			`self.logger.log(`
			`trt.Logger.WARNING, f'Build tensorrt engine finish.\n'`
			`f'Save in {str(self.weight.absolute())}')`

			`def build(self,`
			`fp16: bool = True,`
			`input_shape: Union[List, Tuple] = (1, 3, 640, 640),`
			`iou_thres: float = 0.65,`
			`conf_thres: float = 0.25,`
			`topk: int = 100,`
			`with_profiling=True) -> None:`
			`self.__build_engine(fp16, input_shape, iou_thres, conf_thres, topk,`
			`with_profiling)`

			`def build_from_onnx(self,`
			`iou_thres: float = 0.65,`
			`conf_thres: float = 0.25,`
			`topk: int = 100):`
			`parser = trt.OnnxParser(self.network, self.logger)`
			`onnx_model = onnx.load(str(self.checkpoint))`
Add seg README 2 years ago			`if not self.seg:`
			`onnx_model.graph.node[-1].attribute[2].i = topk`
			`onnx_model.graph.node[-1].attribute[3].f = conf_thres`
			`onnx_model.graph.node[-1].attribute[4].f = iou_thres`
Support TensorRT api build 2 years ago
			`if not parser.parse(onnx_model.SerializeToString()):`
			`raise RuntimeError(`
			`f'failed to load ONNX file: {str(self.checkpoint)}')`
			`inputs = [`
			`self.network.get_input(i) for i in range(self.network.num_inputs)`
			`]`
			`outputs = [`
			`self.network.get_output(i) for i in range(self.network.num_outputs)`
			`]`
add converter 2 years ago
Support TensorRT api build 2 years ago			`for inp in inputs:`
			`self.logger.log(`
			`trt.Logger.WARNING,`
			`f'input "{inp.name}" with shape: {inp.shape} '`
			`f'dtype: {inp.dtype}')`
			`for out in outputs:`
			`self.logger.log(`
			`trt.Logger.WARNING,`
			`f'output "{out.name}" with shape: {out.shape} '`
			`f'dtype: {out.dtype}')`

			`def build_from_api(`
			`self,`
			`fp16: bool = True,`
			`input_shape: Union[List, Tuple] = (1, 3, 640, 640),`
			`iou_thres: float = 0.65,`
			`conf_thres: float = 0.25,`
			`topk: int = 100,`
			`):`
Add seg README 2 years ago			`assert not self.seg`
Support TensorRT api build 2 years ago			`from .api import SPPF, C2f, Conv, Detect, get_depth, get_width`

			`with open(self.checkpoint, 'rb') as f:`
			`state_dict = pickle.load(f)`
			`mapping = {0.25: 1024, 0.5: 1024, 0.75: 768, 1.0: 512, 1.25: 512}`

			`GW = state_dict['GW']`
			`GD = state_dict['GD']`
			`width_64 = get_width(64, GW)`
			`width_128 = get_width(128, GW)`
			`width_256 = get_width(256, GW)`
			`width_512 = get_width(512, GW)`
			`width_1024 = get_width(mapping[GW], GW)`
			`depth_3 = get_depth(3, GD)`
			`depth_6 = get_depth(6, GD)`
			`strides = state_dict['strides']`
			`reg_max = state_dict['reg_max']`
			`images = self.network.add_input(name='images',`
			`dtype=trt.float32,`
			`shape=trt.Dims4(input_shape))`
			`assert images, 'Add input failed'`

			`Conv_0 = Conv(self.network, state_dict, images, width_64, 3, 2, 1,`
			`'Conv.0')`
			`Conv_1 = Conv(self.network, state_dict, Conv_0.get_output(0),`
			`width_128, 3, 2, 1, 'Conv.1')`
			`C2f_2 = C2f(self.network, state_dict, Conv_1.get_output(0), width_128,`
			`depth_3, True, 1, 0.5, 'C2f.2')`
			`Conv_3 = Conv(self.network, state_dict, C2f_2.get_output(0), width_256,`
			`3, 2, 1, 'Conv.3')`
			`C2f_4 = C2f(self.network, state_dict, Conv_3.get_output(0), width_256,`
			`depth_6, True, 1, 0.5, 'C2f.4')`
			`Conv_5 = Conv(self.network, state_dict, C2f_4.get_output(0), width_512,`
			`3, 2, 1, 'Conv.5')`
			`C2f_6 = C2f(self.network, state_dict, Conv_5.get_output(0), width_512,`
			`depth_6, True, 1, 0.5, 'C2f.6')`
			`Conv_7 = Conv(self.network, state_dict, C2f_6.get_output(0),`
			`width_1024, 3, 2, 1, 'Conv.7')`
			`C2f_8 = C2f(self.network, state_dict, Conv_7.get_output(0), width_1024,`
			`depth_3, True, 1, 0.5, 'C2f.8')`
			`SPPF_9 = SPPF(self.network, state_dict, C2f_8.get_output(0),`
			`width_1024, width_1024, 5, 'SPPF.9')`
			`Upsample_10 = self.network.add_resize(SPPF_9.get_output(0))`
			`assert Upsample_10, 'Add Upsample_10 failed'`
			`Upsample_10.resize_mode = trt.ResizeMode.NEAREST`
			`Upsample_10.shape = Upsample_10.get_output(`
			`0).shape[:2] + C2f_6.get_output(0).shape[2:]`
			`input_tensors11 = [Upsample_10.get_output(0), C2f_6.get_output(0)]`
			`Cat_11 = self.network.add_concatenation(input_tensors11)`
			`C2f_12 = C2f(self.network, state_dict, Cat_11.get_output(0), width_512,`
			`depth_3, False, 1, 0.5, 'C2f.12')`
			`Upsample13 = self.network.add_resize(C2f_12.get_output(0))`
			`assert Upsample13, 'Add Upsample13 failed'`
			`Upsample13.resize_mode = trt.ResizeMode.NEAREST`
			`Upsample13.shape = Upsample13.get_output(`
			`0).shape[:2] + C2f_4.get_output(0).shape[2:]`
			`input_tensors14 = [Upsample13.get_output(0), C2f_4.get_output(0)]`
			`Cat_14 = self.network.add_concatenation(input_tensors14)`
			`C2f_15 = C2f(self.network, state_dict, Cat_14.get_output(0), width_256,`
			`depth_3, False, 1, 0.5, 'C2f.15')`
			`Conv_16 = Conv(self.network, state_dict, C2f_15.get_output(0),`
			`width_256, 3, 2, 1, 'Conv.16')`
			`input_tensors17 = [Conv_16.get_output(0), C2f_12.get_output(0)]`
			`Cat_17 = self.network.add_concatenation(input_tensors17)`
			`C2f_18 = C2f(self.network, state_dict, Cat_17.get_output(0), width_512,`
			`depth_3, False, 1, 0.5, 'C2f.18')`
			`Conv_19 = Conv(self.network, state_dict, C2f_18.get_output(0),`
			`width_512, 3, 2, 1, 'Conv.19')`
			`input_tensors20 = [Conv_19.get_output(0), SPPF_9.get_output(0)]`
			`Cat_20 = self.network.add_concatenation(input_tensors20)`
			`C2f_21 = C2f(self.network, state_dict, Cat_20.get_output(0),`
			`width_1024, depth_3, False, 1, 0.5, 'C2f.21')`
			`input_tensors22 = [`
			`C2f_15.get_output(0),`
			`C2f_18.get_output(0),`
			`C2f_21.get_output(0)`
			`]`
			`batched_nms = Detect(self.network, state_dict, input_tensors22,`
			`strides, 'Detect.22', reg_max, fp16, iou_thres,`
			`conf_thres, topk)`
			`for o in range(batched_nms.num_outputs):`
			`self.network.mark_output(batched_nms.get_output(o))`
add converter 2 years ago

			`class TRTModule(torch.nn.Module):`
Support TensorRT api build 2 years ago			`dtypeMapping = {`
			`trt.bool: torch.bool,`
			`trt.int8: torch.int8,`
			`trt.int32: torch.int32,`
			`trt.float16: torch.float16,`
			`trt.float32: torch.float32`
			`}`

			`def __init__(self, weight: Union[str, Path],`
			`device: Optional[torch.device]) -> None:`
add converter 2 years ago			`super(TRTModule, self).__init__()`
			`self.weight = Path(weight) if isinstance(weight, str) else weight`
			`self.device = device if device is not None else torch.device('cuda:0')`
			`self.stream = torch.cuda.Stream(device=device)`
			`self.__init_engine()`
			`self.__init_bindings()`

Add profiler 2 years ago			`def __init_engine(self) -> None:`
add converter 2 years ago			`logger = trt.Logger(trt.Logger.WARNING)`
			`trt.init_libnvinfer_plugins(logger, namespace='')`
			`with trt.Runtime(logger) as runtime:`
			`model = runtime.deserialize_cuda_engine(self.weight.read_bytes())`

			`context = model.create_execution_context()`
Rename 2 years ago			`num_bindings = model.num_bindings`
			`names = [model.get_binding_name(i) for i in range(num_bindings)]`
add converter 2 years ago
Rename 2 years ago			`self.bindings: List[int] = [0] * num_bindings`
add converter 2 years ago			`num_inputs, num_outputs = 0, 0`

Rename 2 years ago			`for i in range(num_bindings):`
add converter 2 years ago			`if model.binding_is_input(i):`
			`num_inputs += 1`
			`else:`
			`num_outputs += 1`

Rename 2 years ago			`self.num_bindings = num_bindings`
add converter 2 years ago			`self.num_inputs = num_inputs`
			`self.num_outputs = num_outputs`
			`self.model = model`
			`self.context = context`
			`self.input_names = names[:num_inputs]`
			`self.output_names = names[num_inputs:]`
Support YOLOv8 seg model convert onnx and tensorrt 2 years ago			`self.idx = list(range(self.num_outputs))`
add converter 2 years ago
Add profiler 2 years ago			`def __init_bindings(self) -> None:`
Rename 2 years ago			`idynamic = odynamic = False`
add converter 2 years ago			`Tensor = namedtuple('Tensor', ('name', 'dtype', 'shape'))`
			`inp_info = []`
			`out_info = []`
			`for i, name in enumerate(self.input_names):`
			`assert self.model.get_binding_name(i) == name`
			`dtype = self.dtypeMapping[self.model.get_binding_dtype(i)]`
			`shape = tuple(self.model.get_binding_shape(i))`
Support TensorRT api build 2 years ago			`if -1 in shape:`
Rename 2 years ago			`idynamic |= True`
add converter 2 years ago			`inp_info.append(Tensor(name, dtype, shape))`
			`for i, name in enumerate(self.output_names):`
			`i += self.num_inputs`
			`assert self.model.get_binding_name(i) == name`
			`dtype = self.dtypeMapping[self.model.get_binding_dtype(i)]`
			`shape = tuple(self.model.get_binding_shape(i))`
Rename 2 years ago			`if -1 in shape:`
			`odynamic |= True`
add converter 2 years ago			`out_info.append(Tensor(name, dtype, shape))`

Rename 2 years ago			`if not odynamic:`
Support TensorRT api build 2 years ago			`self.output_tensor = [`
			`torch.empty(info.shape, dtype=info.dtype, device=self.device)`
			`for info in out_info`
			`]`
Rename 2 years ago			`self.idynamic = idynamic`
			`self.odynamic = odynamic`
Add infer.py 2 years ago			`self.inp_info = inp_info`
Rename 2 years ago			`self.out_info = out_info`
add converter 2 years ago
Add profiler 2 years ago			`def set_profiler(self, profiler: Optional[trt.IProfiler]):`
Support TensorRT api build 2 years ago			`self.context.profiler = profiler \`
			`if profiler is not None else trt.Profiler()`
Add profiler 2 years ago
Support YOLOv8 seg model convert onnx and tensorrt 2 years ago			`def set_desired(self, desired: Optional[Union[List, Tuple]]):`
			`if isinstance(desired,`
			`(list, tuple)) and len(desired) == self.num_outputs:`
			`self.idx = [self.output_names.index(i) for i in desired]`

Add profiler 2 years ago			`def forward(self, *inputs) -> Union[Tuple, torch.Tensor]:`
add converter 2 years ago
			`assert len(inputs) == self.num_inputs`
Support TensorRT api build 2 years ago			`contiguous_inputs: List[torch.Tensor] = [`
			`i.contiguous() for i in inputs`
			`]`
add converter 2 years ago
			`for i in range(self.num_inputs):`
			`self.bindings[i] = contiguous_inputs[i].data_ptr()`
Rename 2 years ago			`if self.idynamic:`
Support TensorRT api build 2 years ago			`self.context.set_binding_shape(`
			`i, tuple(contiguous_inputs[i].shape))`
add converter 2 years ago
			`outputs: List[torch.Tensor] = []`

			`for i in range(self.num_outputs):`
			`j = i + self.num_inputs`
Fix seg inference with torch and numpy 2 years ago			`if self.odynamic:`
add converter 2 years ago			`shape = tuple(self.context.get_binding_shape(j))`
Support TensorRT api build 2 years ago			`output = torch.empty(size=shape,`
			`dtype=self.out_info[i].dtype,`
			`device=self.device)`
add converter 2 years ago			`else:`
			`output = self.output_tensor[i]`
			`self.bindings[j] = output.data_ptr()`
			`outputs.append(output)`

			`self.context.execute_async_v2(self.bindings, self.stream.cuda_stream)`
			`self.stream.synchronize()`

Support YOLOv8 seg model convert onnx and tensorrt 2 years ago			`return tuple(outputs[i]`
			`for i in self.idx) if len(outputs) > 1 else outputs[0]`
Add profiler 2 years ago

			`class TRTProfilerV1(trt.IProfiler):`
Support TensorRT api build 2 years ago
Add profiler 2 years ago			`def __init__(self):`
			`trt.IProfiler.__init__(self)`
			`self.total_runtime = 0.0`
			`self.recorder = defaultdict(float)`

			`def report_layer_time(self, layer_name: str, ms: float):`
			`self.total_runtime += ms * 1000`
			`self.recorder[layer_name] += ms * 1000`

			`def report(self):`
			`f = '\t%40s\t\t\t\t%10.4f'`
			`print('\t%40s\t\t\t\t%10s' % ('layername', 'cost(us)'))`
			`for name, cost in sorted(self.recorder.items(), key=lambda x: -x[1]):`
Support TensorRT api build 2 years ago			`print(`
			`f %`
			`(name if len(name) < 40 else name[:35] + ' ' + '*' * 4, cost))`
Add profiler 2 years ago			`print(f'\nTotal Inference Time: {self.total_runtime:.4f}(us)')`


			`class TRTProfilerV0(trt.IProfiler):`
Support TensorRT api build 2 years ago
Add profiler 2 years ago			`def __init__(self):`
			`trt.IProfiler.__init__(self)`

			`def report_layer_time(self, layer_name: str, ms: float):`
			`f = '\t%40s\t\t\t\t%10.4fms'`
Support TensorRT api build 2 years ago			`print(f % (layer_name if len(layer_name) < 40 else layer_name[:35] +`
			`' ' + '*' * 4, ms))`