# mmdeploy/mmdeploy/backend/tensorrt/utils.py
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
import re
import sys
from typing import Any, Dict, Optional, Sequence, Union

import onnx
import tensorrt as trt
from packaging import version

from mmdeploy.utils import get_root_logger

from .init_plugins import load_tensorrt_plugin


def save(engine: Any, path: str) -> None:
    """Serialize a TensorRT engine to disk.

    Args:
        engine (Any): TensorRT engine (or already-serialized engine buffer)
            to write.
        path (str): The absolute disk path to write the engine to.
    """
    with open(path, mode='wb') as f:
        # A live engine must be serialized first; serialized buffers (e.g.
        # IHostMemory from build_serialized_network) are written directly.
        if isinstance(engine, trt.ICudaEngine):
            engine = engine.serialize()
        f.write(bytearray(engine))
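

# Usage sketch for `save` (illustrative only; the engine object and the
# '/tmp/...' path are hypothetical). Both a live ICudaEngine and an
# already-serialized buffer (e.g. IHostMemory from TensorRT >= 8 builds)
# are accepted:
def _example_save(engine: Any) -> None:  # pragma: no cover
    # `save` serializes live engines itself, so either form works here.
    save(engine, '/tmp/end2end.engine')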


def load(path: str, allocator: Optional[Any] = None) -> trt.ICudaEngine:
    """Deserialize a TensorRT engine from disk.

    Args:
        path (str): The disk path to read the engine from.
        allocator (Any): Custom GPU allocator. Defaults to `None`.

    Returns:
        tensorrt.ICudaEngine: The TensorRT engine loaded from disk.
    """
    load_tensorrt_plugin()
    with trt.Logger() as logger, trt.Runtime(logger) as runtime:
        if allocator is not None:
            runtime.gpu_allocator = allocator
        with open(path, mode='rb') as f:
            engine_bytes = f.read()
        trt.init_libnvinfer_plugins(logger, namespace='')
        engine = runtime.deserialize_cuda_engine(engine_bytes)
        return engine
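

# Round-trip usage sketch for `load` (illustrative; the engine paths are
# hypothetical and a CUDA device is assumed to be present):
def _example_load() -> None:  # pragma: no cover
    engine = load('/tmp/end2end.engine')
    # deserialize_cuda_engine always yields an ICudaEngine, so the engine
    # can be re-serialized and saved again
    save(engine, '/tmp/end2end_copy.engine')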


def search_cuda_version() -> Optional[str]:
    """Try shell commands to get the CUDA version, then fall back to
    `torch.version.cuda`.

    Returns:
        str | None: CUDA version, for example '10.2'; `None` if it cannot
            be determined.
    """
    cuda_version = None
    pattern = re.compile(r'[0-9]+\.[0-9]+')
    platform = sys.platform.lower()

    def cmd_result(txt: str):
        cmd = os.popen(txt)
        return cmd.read().strip()

    if platform in ('linux', 'darwin', 'freebsd'):
        cuda_version = cmd_result(
            " nvcc --version | grep release | awk '{print $5}' | awk -F , '{print $1}' "  # noqa: E501
        )
        if cuda_version is None or pattern.match(cuda_version) is None:
            cuda_version = cmd_result(
                " nvidia-smi | grep CUDA | awk '{print $9}' ")
    elif platform in ('win32', 'cygwin'):
        # nvcc_release = "Cuda compilation tools, release 10.2, V10.2.89"
        nvcc_release = cmd_result(' nvcc --version | find "release" ')
        if nvcc_release is not None:
            result = pattern.findall(nvcc_release)
            if len(result) > 0:
                cuda_version = result[0]
        if cuda_version is None or pattern.match(cuda_version) is None:
            # nvidia_smi = "| NVIDIA-SMI 440.33.01 Driver Version: 440.33.01 CUDA Version: 10.2 |"  # noqa: E501
            nvidia_smi = cmd_result(' nvidia-smi | find "CUDA Version" ')
            result = pattern.findall(nvidia_smi)
            if len(result) > 2:
                # the first two matches are the driver version; the third
                # is the CUDA version
                cuda_version = result[2]
    if cuda_version is None or pattern.match(cuda_version) is None:
        try:
            import torch
            cuda_version = torch.version.cuda
        except Exception:
            pass
    return cuda_version
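

# Usage sketch for `search_cuda_version` (illustrative):
def _example_search_cuda_version() -> None:  # pragma: no cover
    cuda_version = search_cuda_version()
    if cuda_version is None:
        print('CUDA version could not be determined')
    else:
        # e.g. '11.3' -> major version 11, mirroring the check in from_onnx
        major = int(cuda_version.split('.')[0])
        print(f'CUDA {cuda_version}, major {major}')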


def from_onnx(onnx_model: Union[str, onnx.ModelProto],
              output_file_prefix: str,
              input_shapes: Dict[str, Dict[str, Sequence[int]]],
              max_workspace_size: int = 0,
              fp16_mode: bool = False,
              int8_mode: bool = False,
              int8_param: Optional[dict] = None,
              device_id: int = 0,
              log_level: trt.Logger.Severity = trt.Logger.ERROR,
              **kwargs) -> trt.ICudaEngine:
    """Create a TensorRT engine from an ONNX model.

    Args:
        onnx_model (str or onnx.ModelProto): Input onnx model to convert from.
        output_file_prefix (str): The path prefix of the output engine file.
        input_shapes (Dict[str, Dict[str, Sequence[int]]]): The min/opt/max
            shapes of each input, keyed by 'min_shape', 'opt_shape' and
            'max_shape'.
        max_workspace_size (int): Max workspace size of the TensorRT engine;
            some tactics and layers need a large workspace. Defaults to `0`.
        fp16_mode (bool): Whether to enable fp16 mode. Defaults to `False`.
        int8_mode (bool): Whether to enable int8 mode. Defaults to `False`.
        int8_param (dict): A dict of parameters for int8 calibration.
            Defaults to `None`.
        device_id (int): The device on which to create the engine.
            Defaults to `0`.
        log_level (trt.Logger.Severity): The log level of TensorRT. Defaults
            to `trt.Logger.ERROR`.

    Returns:
        tensorrt.ICudaEngine: The TensorRT engine created from onnx_model
            (a serialized engine buffer on TensorRT >= 8).

    Example:
        >>> from mmdeploy.apis.tensorrt import from_onnx
        >>> engine = from_onnx(
        >>>     'onnx_model.onnx',
        >>>     'output_model',
        >>>     {'input': {'min_shape': [1, 3, 160, 160],
        >>>                'opt_shape': [1, 3, 320, 320],
        >>>                'max_shape': [1, 3, 640, 640]}},
        >>>     log_level=trt.Logger.WARNING,
        >>>     fp16_mode=True,
        >>>     max_workspace_size=1 << 30,
        >>>     device_id=0)
    """
    if device_id != 0:
        # Select the CUDA device for pycuda via the CUDA_DEVICE environment
        # variable, then restore the previous value.
        old_cuda_device = os.environ.get('CUDA_DEVICE', None)
        os.environ['CUDA_DEVICE'] = str(device_id)
        import pycuda.autoinit  # noqa:F401
        if old_cuda_device is not None:
            os.environ['CUDA_DEVICE'] = old_cuda_device
        else:
            os.environ.pop('CUDA_DEVICE')

    load_tensorrt_plugin()
    root_logger = get_root_logger()

    # create builder and network
    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)
    # TODO: use TorchAllocator as builder.gpu_allocator
    EXPLICIT_BATCH = 1 << int(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)

    # parse onnx
    parser = trt.OnnxParser(network, logger)
    if isinstance(onnx_model, str):
        onnx_model = onnx.load(onnx_model)
    if not parser.parse(onnx_model.SerializeToString()):
        error_msgs = ''
        for error in range(parser.num_errors):
            error_msgs += f'{parser.get_error(error)}\n'
        raise RuntimeError(f'Failed to parse onnx, {error_msgs}')

    # config builder
    if version.parse(trt.__version__) < version.parse('8'):
        builder.max_workspace_size = max_workspace_size
    config = builder.create_builder_config()
    if hasattr(config, 'set_memory_pool_limit'):
        # newer TensorRT replaces max_workspace_size with memory pool limits
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE,
                                     max_workspace_size)
    else:
        config.max_workspace_size = max_workspace_size

    cuda_version = search_cuda_version()
    if cuda_version is not None:
        version_major = int(cuda_version.split('.')[0])
        if version_major < 11:
            # cuBLASLt requires CUDA 11+, so disable the CUBLAS_LT tactic
            # source on older toolkits
            tactic_source = config.get_tactic_sources() - (
                1 << int(trt.TacticSource.CUBLAS_LT))
            config.set_tactic_sources(tactic_source)

    profile = builder.create_optimization_profile()
    for input_name, param in input_shapes.items():
        min_shape = param['min_shape']
        opt_shape = param['opt_shape']
        max_shape = param['max_shape']
        profile.set_shape(input_name, min_shape, opt_shape, max_shape)
    if config.add_optimization_profile(profile) < 0:
        root_logger.warning(f'Invalid optimization profile {profile}.')

    if fp16_mode:
        if not getattr(builder, 'platform_has_fast_fp16', True):
            root_logger.warning('Platform does not have fast native fp16.')
        if version.parse(trt.__version__) < version.parse('8'):
            builder.fp16_mode = fp16_mode
        config.set_flag(trt.BuilderFlag.FP16)

    if int8_mode:
        if not getattr(builder, 'platform_has_fast_int8', True):
            root_logger.warning('Platform does not have fast native int8.')
        from .calib_utils import HDF5Calibrator
        config.set_flag(trt.BuilderFlag.INT8)
        assert int8_param is not None
        config.int8_calibrator = HDF5Calibrator(
            int8_param['calib_file'],
            input_shapes,
            model_type=int8_param['model_type'],
            device_id=device_id,
            algorithm=int8_param.get(
                'algorithm', trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2))
        if version.parse(trt.__version__) < version.parse('8'):
            builder.int8_mode = int8_mode
            builder.int8_calibrator = config.int8_calibrator

    # create engine; TensorRT >= 8 deprecates build_engine in favor of
    # build_serialized_network
    if hasattr(builder, 'build_serialized_network'):
        engine = builder.build_serialized_network(network, config)
    else:
        engine = builder.build_engine(network, config)

    assert engine is not None, 'Failed to create TensorRT engine'
    save(engine, output_file_prefix + '.engine')
    return engine
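

# End-to-end usage sketch for `from_onnx` (illustrative; 'end2end.onnx' and
# the input shapes are hypothetical, and a CUDA device plus the mmdeploy
# TensorRT plugins are assumed to be available):
def _example_from_onnx() -> None:  # pragma: no cover
    from_onnx(
        'end2end.onnx',
        'end2end',
        {'input': {'min_shape': [1, 3, 224, 224],
                   'opt_shape': [1, 3, 224, 224],
                   'max_shape': [1, 3, 224, 224]}},
        fp16_mode=True,
        max_workspace_size=1 << 30)
    # from_onnx wrote 'end2end.engine'; deserialize it for inference
    engine = load('end2end.engine')
    print(type(engine))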


def get_trt_log_level() -> trt.Logger.Severity:
    """Get the TensorRT log level corresponding to the root logger's level.

    Returns:
        level (tensorrt.Logger.Severity): Logging level of tensorrt.Logger.
    """
    logger = get_root_logger()
    level = logger.level
    trt_log_level = trt.Logger.INFO
    if level == logging.ERROR:
        trt_log_level = trt.Logger.ERROR
    elif level == logging.WARNING:
        trt_log_level = trt.Logger.WARNING
    elif level == logging.DEBUG:
        trt_log_level = trt.Logger.VERBOSE
    return trt_log_level
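

# Usage sketch for `get_trt_log_level` (illustrative): reuse the root
# logger's verbosity when building an engine. File names and shapes below
# are hypothetical.
def _example_get_trt_log_level() -> None:  # pragma: no cover
    log_level = get_trt_log_level()
    # pass through to from_onnx so TensorRT honors the configured verbosity
    from_onnx(
        'end2end.onnx',
        'end2end',
        {'input': {'min_shape': [1, 3, 224, 224],
                   'opt_shape': [1, 3, 224, 224],
                   'max_shape': [1, 3, 224, 224]}},
        log_level=log_level)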