#!/usr/bin/env python

import os
import os.path as osp
import argparse
from operator import itemgetter

import numpy as np
import paddle
from paddle.inference import Config
from paddle.inference import create_predictor
from paddle.inference import PrecisionType
from paddlers.tasks import load_model
from paddlers.utils import logging

from config_utils import parse_configs


class _bool(object):
    """Parse 'true'/'false' command-line strings into real booleans."""

    def __new__(cls, x):
        if isinstance(x, str):
            if x.lower() == 'false':
                return False
            elif x.lower() == 'true':
                return True
        # Fall back to ordinary truthiness for non-string inputs.
        return bool(x)


class TIPCPredictor(object):
    def __init__(self,
                 model_dir,
                 device='cpu',
                 gpu_id=0,
                 cpu_thread_num=1,
                 use_mkl=True,
                 mkl_thread_num=4,
                 use_trt=False,
                 memory_optimize=True,
                 trt_precision_mode='fp32',
                 benchmark=False,
                 model_name='',
                 batch_size=1):
        self.model_dir = model_dir
        self._model = load_model(model_dir, with_net=False)

        if trt_precision_mode.lower() == 'fp32':
            trt_precision_mode = PrecisionType.Float32
        elif trt_precision_mode.lower() == 'fp16':
            trt_precision_mode = PrecisionType.Float16
        else:
            logging.error(
                "TensorRT precision mode {} is invalid. Supported modes are fp32 and fp16."
                .format(trt_precision_mode),
                exit=True)

        self.config = self.get_config(
            device=device,
            gpu_id=gpu_id,
            cpu_thread_num=cpu_thread_num,
            use_mkl=use_mkl,
            mkl_thread_num=mkl_thread_num,
            use_trt=use_trt,
            use_glog=False,
            memory_optimize=memory_optimize,
            max_trt_batch_size=1,
            trt_precision_mode=trt_precision_mode)
        self.predictor = create_predictor(self.config)
        self.batch_size = batch_size

        if benchmark:
            # Lazy import: auto_log is only required when benchmarking.
            import auto_log
            pid = os.getpid()
            self.autolog = auto_log.AutoLogger(
                model_name=model_name,
                model_precision=trt_precision_mode,
                batch_size=batch_size,
                data_shape='dynamic',
                save_path=None,
                inference_config=self.config,
                pids=pid,
                process_name=None,
                gpu_ids=0,
                time_keys=[
                    'preprocess_time', 'inference_time', 'postprocess_time'
                ],
                warmup=0,
                logger=logging)
        self.benchmark = benchmark

    def get_config(self, device, gpu_id, cpu_thread_num, use_mkl,
                   mkl_thread_num, use_trt, use_glog, memory_optimize,
                   max_trt_batch_size, trt_precision_mode):
        config = Config(
            osp.join(self.model_dir, 'model.pdmodel'),
            osp.join(self.model_dir, 'model.pdiparams'))

        if device == 'gpu':
            # Reserve an initial 200 MB of GPU memory on the target card.
            config.enable_use_gpu(200, gpu_id)
            config.switch_ir_optim(True)
            if use_trt:
                if self._model.model_type == 'segmenter':
                    logging.warning(
                        "Semantic segmentation models do not support TensorRT acceleration, "
                        "TensorRT is forcibly disabled.")
                elif self._model.model_type == 'detector' and 'RCNN' in self._model.__class__.__name__:
                    logging.warning(
                        "RCNN models do not support TensorRT acceleration, "
                        "TensorRT is forcibly disabled.")
                else:
                    config.enable_tensorrt_engine(
                        workspace_size=1 << 10,  # In bytes.
                        max_batch_size=max_trt_batch_size,
                        min_subgraph_size=3,
                        precision_mode=trt_precision_mode,
                        use_static=False,
                        use_calib_mode=False)
        else:
            config.disable_gpu()
            config.set_cpu_math_library_num_threads(cpu_thread_num)
            if use_mkl:
                if self._model.__class__.__name__ == 'MaskRCNN':
                    logging.warning(
                        "MaskRCNN does not support MKL-DNN, MKL-DNN is forcibly disabled."
                    )
                else:
                    try:
                        # Cache 10 different input shapes for MKL-DNN to avoid
                        # memory leaks.
                        config.set_mkldnn_cache_capacity(10)
                        config.enable_mkldnn()
                        config.set_cpu_math_library_num_threads(mkl_thread_num)
                    except Exception:
                        logging.warning(
                            "The current environment does not support MKL-DNN, MKL-DNN is disabled."
                        )

        if not use_glog:
            config.disable_glog_info()
        if memory_optimize:
            config.enable_memory_optim()
        config.switch_use_feed_fetch_ops(False)
        return config
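    # For reference, the Paddle Inference calls that `_run` below issues
    # against the predictor built from this config (a minimal sketch; tensor
    # names and shapes depend on the exported model):
    #
    #     handle = predictor.get_input_handle(name)
    #     handle.copy_from_cpu(batch_ndarray)   # feed a numpy batch
    #     predictor.run()                       # execute the graph
    #     out = predictor.get_output_handle(name).copy_to_cpu()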
    def preprocess(self, images, transforms):
        preprocessed_samples = self._model.preprocess(
            images, transforms, to_tensor=False)
        if self._model.model_type == 'classifier':
            preprocessed_samples = {'image': preprocessed_samples[0]}
        elif self._model.model_type == 'segmenter':
            preprocessed_samples = {
                'image': preprocessed_samples[0],
                'ori_shape': preprocessed_samples[1]
            }
        elif self._model.model_type == 'detector':
            pass
        elif self._model.model_type == 'change_detector':
            preprocessed_samples = {
                'image': preprocessed_samples[0],
                'image2': preprocessed_samples[1],
                'ori_shape': preprocessed_samples[2]
            }
        elif self._model.model_type == 'restorer':
            preprocessed_samples = {
                'image': preprocessed_samples[0],
                'tar_shape': preprocessed_samples[1]
            }
        else:
            logging.error(
                "Invalid model type {}.".format(self._model.model_type),
                exit=True)
        return preprocessed_samples

    def postprocess(self,
                    net_outputs,
                    topk=1,
                    ori_shape=None,
                    tar_shape=None,
                    transforms=None):
        if self._model.model_type == 'classifier':
            # Clamp top-k so we never request more classes than the model has.
            true_topk = min(self._model.num_classes, topk)
            if self._model.postprocess is None:
                self._model.build_postprocess_from_labels(true_topk)
            # XXX: Convert ndarray to tensor as self._model.postprocess requires
            assert len(net_outputs) == 1
            net_outputs = paddle.to_tensor(net_outputs[0])
            outputs = self._model.postprocess(net_outputs)
            class_ids = map(itemgetter('class_ids'), outputs)
            scores = map(itemgetter('scores'), outputs)
            label_names = map(itemgetter('label_names'), outputs)
            preds = [{
                'class_ids_map': l,
                'scores_map': s,
                'label_names_map': n,
            } for l, s, n in zip(class_ids, scores, label_names)]
        elif self._model.model_type in ('segmenter', 'change_detector'):
            label_map, score_map = self._model.postprocess(
                net_outputs,
                batch_origin_shape=ori_shape,
                transforms=transforms.transforms)
            preds = [{
                'label_map': l,
                'score_map': s
            } for l, s in zip(label_map, score_map)]
        elif self._model.model_type == 'detector':
            net_outputs = {
                k: v
                for k, v in zip(['bbox', 'bbox_num', 'mask'], net_outputs)
            }
            preds = self._model.postprocess(net_outputs)
        elif self._model.model_type == 'restorer':
            res_maps = self._model.postprocess(
                net_outputs[0],
                batch_tar_shape=tar_shape,
                transforms=transforms.transforms)
            preds = [{'res_map': res_map} for res_map in res_maps]
        else:
            logging.error(
                "Invalid model type {}.".format(self._model.model_type),
                exit=True)
        return preds

    def _run(self, images, topk=1, transforms=None, time_it=False):
        if self.benchmark and time_it:
            self.autolog.times.start()

        preprocessed_input = self.preprocess(images, transforms)

        input_names = self.predictor.get_input_names()
        for name in input_names:
            input_tensor = self.predictor.get_input_handle(name)
            input_tensor.copy_from_cpu(preprocessed_input[name])

        if self.benchmark and time_it:
            self.autolog.times.stamp()

        self.predictor.run()

        output_names = self.predictor.get_output_names()
        net_outputs = []
        for name in output_names:
            output_tensor = self.predictor.get_output_handle(name)
            net_outputs.append(output_tensor.copy_to_cpu())

        if self.benchmark and time_it:
            self.autolog.times.stamp()

        res = self.postprocess(
            net_outputs,
            topk,
            ori_shape=preprocessed_input.get('ori_shape', None),
            tar_shape=preprocessed_input.get('tar_shape', None),
            transforms=transforms)

        if self.benchmark and time_it:
            self.autolog.times.end(stamp=True)

        return res
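    # `predict` below reads batches from a PaddleRS-style file list: one sample
    # per line, whitespace-separated paths relative to `data_dir`. An
    # illustrative example (hypothetical file names):
    #
    #     images/t1/0001.png images/t2/0001.png
    #     images/t1/0002.png images/t2/0002.png
    #
    # `_parse_lines` keeps the first two columns as a bi-temporal pair for
    # change detection models and only the first column for all other tasks.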
    def predict(self, data_dir, file_list, topk=1, warmup_iters=5):
        transforms = self._model.test_transforms

        # Warm up: loop over the file list repeatedly (the for/else pattern
        # restarts the scan when the list is exhausted) until `warmup_iters`
        # untimed batches have been run.
        iters = 0
        while True:
            for images in self._parse_lines(data_dir, file_list):
                if iters >= warmup_iters:
                    break
                self._run(
                    images=images,
                    topk=topk,
                    transforms=transforms,
                    time_it=False)
                iters += 1
            else:
                continue
            break

        results = []
        for images in self._parse_lines(data_dir, file_list):
            res = self._run(
                images=images, topk=topk, transforms=transforms, time_it=True)
            results.append(res)
        return results

    def _parse_lines(self, data_dir, file_list):
        with open(file_list, 'r') as f:
            batch = []
            for line in f:
                items = line.strip().split()
                items = [osp.join(data_dir, item) for item in items]
                if self._model.model_type == 'change_detector':
                    batch.append((items[0], items[1]))
                else:
                    batch.append(items[0])
                if len(batch) == self.batch_size:
                    yield batch
                    # Start a fresh list instead of clearing in place, so a
                    # consumer that holds on to a yielded batch is unaffected.
                    batch = []
            # Flush the final, possibly incomplete batch.
            if 0 < len(batch) < self.batch_size:
                yield batch


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str)
    parser.add_argument('--inherit_off', action='store_true')
    parser.add_argument('--model_dir', type=str, default='./')
    parser.add_argument(
        '--device', type=str, choices=['cpu', 'gpu'], default='cpu')
    parser.add_argument('--enable_mkldnn', type=_bool, default=False)
    parser.add_argument('--cpu_threads', type=int, default=10)
    parser.add_argument('--use_trt', type=_bool, default=False)
    parser.add_argument(
        '--precision', type=str, choices=['fp32', 'fp16'], default='fp16')
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--benchmark', type=_bool, default=False)
    parser.add_argument('--model_name', type=str, default='')

    args = parser.parse_args()

    cfg = parse_configs(args.config, not args.inherit_off)
    eval_dataset = cfg['datasets']['eval']
    data_dir = eval_dataset.args['data_dir']
    file_list = eval_dataset.args['file_list']

    predictor = TIPCPredictor(
        args.model_dir,
        device=args.device,
        cpu_thread_num=args.cpu_threads,
        use_mkl=args.enable_mkldnn,
        mkl_thread_num=args.cpu_threads,
        use_trt=args.use_trt,
        trt_precision_mode=args.precision,
        benchmark=args.benchmark,
        model_name=args.model_name,
        batch_size=args.batch_size)

    predictor.predict(data_dir, file_list)

    if args.benchmark:
        predictor.autolog.report()
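# Example invocation (hypothetical script and config paths; the config must
# define a `datasets.eval` entry providing `data_dir` and `file_list`):
#
#   python infer.py --config ./test_tipc/configs/xxx.yaml \
#       --model_dir ./inference_model --device gpu --use_trt True \
#       --precision fp16 --batch_size 1 --benchmark True --model_name unet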