#include "fastrt/utils.h" #include "fastrt/InferenceEngine.h" namespace trt { InferenceEngine::InferenceEngine(const EngineConfig &enginecfg): _engineCfg(enginecfg) { TRTASSERT((_engineCfg.max_batch_size > 0)); CHECK(cudaSetDevice(_engineCfg.device_id)); _runtime = make_holder(nvinfer1::createInferRuntime(gLogger)); TRTASSERT(_runtime.get()); _engine = make_holder(_runtime->deserializeCudaEngine(_engineCfg.trtModelStream.get(), _engineCfg.stream_size)); TRTASSERT(_engine.get()); _context = make_holder(_engine->createExecutionContext()); TRTASSERT(_context.get()); _inputSize = _engineCfg.max_batch_size * 3 * _engineCfg.input_h * _engineCfg.input_w * _depth; _outputSize = _engineCfg.max_batch_size * _engineCfg.output_size * _depth; CHECK(cudaMallocHost((void**)&_input, _inputSize)); CHECK(cudaMallocHost((void**)&_output, _outputSize)); _streamptr = std::shared_ptr( new cudaStream_t, [](cudaStream_t* ptr){ cudaStreamDestroy(*ptr); if(ptr != nullptr){ delete ptr; } }); CHECK(cudaStreamCreate(&*_streamptr.get())); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. TRTASSERT((_engine->getNbBindings() == 2)); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() _inputIndex = _engine->getBindingIndex(_engineCfg.input_name.c_str()); _outputIndex = _engine->getBindingIndex(_engineCfg.output_name.c_str()); // Create GPU buffers on device CHECK(cudaMalloc(&_buffers[_inputIndex], _inputSize)); CHECK(cudaMalloc(&_buffers[_outputIndex], _outputSize)); _inputSize /= _engineCfg.max_batch_size; _outputSize /= _engineCfg.max_batch_size; } bool InferenceEngine::doInference(const int inference_batch_size, std::function preprocessing) { TRTASSERT(( inference_batch_size <= _engineCfg.max_batch_size && inference_batch_size > 0)); preprocessing(_input); CHECK(cudaSetDevice(_engineCfg.device_id)); CHECK(cudaMemcpyAsync(_buffers[_inputIndex], _input, inference_batch_size * _inputSize, cudaMemcpyHostToDevice, *_streamptr)); auto status = _context->enqueue(inference_batch_size, _buffers, *_streamptr, nullptr); CHECK(cudaMemcpyAsync(_output, _buffers[_outputIndex], inference_batch_size * _outputSize, cudaMemcpyDeviceToHost, *_streamptr)); CHECK(cudaStreamSynchronize(*_streamptr)); return status; } InferenceEngine::InferenceEngine(InferenceEngine &&other) noexcept: _engineCfg(other._engineCfg) , _input(other._input) , _output(other._output) , _inputIndex(other._inputIndex) , _outputIndex(other._outputIndex) , _inputSize(other._inputSize) , _outputSize(other._outputSize) , _runtime(std::move(other._runtime)) , _engine(std::move(other._engine)) , _context(std::move(other._context)) , _streamptr(other._streamptr) { _buffers[0] = other._buffers[0]; _buffers[1] = other._buffers[1]; other._streamptr.reset(); other._input = nullptr; other._output = nullptr; other._buffers[0] = nullptr; other._buffers[1] = nullptr; } InferenceEngine::~InferenceEngine() { CHECK(cudaFreeHost(_input)); CHECK(cudaFreeHost(_output)); CHECK(cudaFree(_buffers[_inputIndex])); CHECK(cudaFree(_buffers[_outputIndex])); } }