在上一篇中学习了pytorch模型如何转为onnx模型,TensorRT推理的一般过程,以及python 接口下onnx采用TensorRT推理的示例。 本文继续学习下C++接口下采用TensorRT推理onnx模型。
其过程和上篇文章中说到的过程一致:
- 使用 logger 创建 builder;
- builder 可以创建 INetworkDefinition,即计算图;
- 使用 onnxParser 解析 onnx 模型,填充计算图;
- 由计算图 INetworkDefinition 创建 CudaEngine;
- 由 cudaEngine 创建推理的上下文 IExecutionContext。
推理过程:
- 分配输入输出空间
- 将输入拷贝到缓存空间
- 进行推理
- 将输出拷贝到cpu
- 进行后处理
下面我们直接以~/TensorRT-7.0.0.11/samples/sampleMnist/
为例,中间遇到的一些问题记录下其解决办法。
// 头文件
#include "argsParser.h" // TensorRT-7.0.0.11/samples/common
#include "buffers.h" // TensorRT-7.0.0.11/samples/common
#include "common.h" // TensorRT-7.0.0.11/samples/common
#include "logger.h" // TensorRT-7.0.0.11/samples/common
#include "parserOnnxConfig.h" // TensorRT-7.0.0.11/samples/common
#include "NvInfer.h" // TensorRT-7.0.0.11/include
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
示例代码中将mnist的转换过程和推理过程封装成了一个类。
class SampleOnnxMNIST
{
template <typename T>
using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;
// 模板类的指针声明,以及对应的deleter函数,参考
//http://senlinzhan.github.io/2015/04/20/%E8%B0%88%E8%B0%88C-%E7%9A%84%E6%99%BA%E8%83%BD%E6%8C%87%E9%92%88/
public:
SampleOnnxMNIST(const samplesCommon::OnnxSampleParams& params)
: mParams(params)
, mEngine(nullptr)
{//构造函数,params用于存储需要的一些变量值
//mEngine即cudaEngine的指针
}
bool build(); // 用于构造cudaEngine, Function builds the network engine
bool infer(); // 推理 Runs the TensorRT inference engine for this sample
private:
samplesCommon::OnnxSampleParams mParams; //实例的参数,
nvinfer1::Dims mInputDims; // 网络输入参数的维度, 和python中的tuple类型相似
nvinfer1::Dims mOutputDims; //!网络输出参数的维度
int mNumber{0}; //!< The number to classify
std::shared_ptr<nvinfer1::ICudaEngine> mEngine; // CudaEngine的指针
bool constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
SampleUniquePtr<nvinfer1::INetworkDefinition>& network,
SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
SampleUniquePtr<nvonnxparser::IParser>& parser);
// 类似python接口中的with 环境,目的是采用onnxParser的结果填充计算图并得到cudaEngine
bool processInput(const samplesCommon::BufferManager& buffers);
//对输入进行预处理,并将其存入buffer
bool verifyOutput(const samplesCommon::BufferManager& buffers);
// 输出存放在缓存中,从中获取输出
};
其中,命名空间 samplesCommon 中的 OnnxSampleParams 在 argsParser.h 中定义:
//! Parameters common to the samples: batch size, precision mode, DLA core,
//! data search paths and the names of the network input/output tensors.
struct SampleParams
{
    int batchSize = 1;                          //!< Number of inputs in a batch
    int dlaCore = -1;                           //!< Specify the DLA core to run network on.
    bool int8 = false;                          //!< Allow running the network in Int8 mode.
    bool fp16 = false;                          //!< Allow running the network in FP16 mode.
    std::vector<std::string> dataDirs;          //!< Directory paths where sample data files are stored
    std::vector<std::string> inputTensorNames;  //!< Names of the network input tensors
    std::vector<std::string> outputTensorNames; //!< Names of the network output tensors
};
//! SampleParams extended with the file name of the ONNX model.
struct OnnxSampleParams : SampleParams
{
    std::string onnxFileName; //!< Filename of ONNX file of a network
};
可以发现,SampleParams 中的主要参数包括网络输入输出的名称、网络输入的 batch size、精度模式,以及数据可能的存储路径(该路径参数可以有多个)。OnnxSampleParams 则添加了 onnx 模型的路径参数。
接下来,我们先忽略类的方法的具体实现,看一下主函数中的代码:
//! Prints the command-line usage of the sample to std::cout, one option per line.
void printHelpInfo()
{
    // Each entry is emitted as one line; adjacent literals inside an entry are
    // concatenated by the compiler into a single string.
    static const char* const kHelpLines[] = {
        "Usage: ./sample_onnx_mnist [-h or --help] [-d or --datadir=<path to data directory>] [--useDLACore=<int>]",
        "--help Display help information",
        "--datadir Specify path to a data directory, overriding the default. This option can be used "
        "multiple times to add multiple directories. If no data directories are given, the default is to use "
        "(data/samples/mnist/, data/mnist/)",
        "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, "
        "where n is the number of DLA engines on the platform.",
        "--int8 Run in Int8 mode.",
        "--fp16 Run in FP16 mode.",
    };

    for (const char* helpLine : kHelpLines)
    {
        std::cout << helpLine << std::endl;
    }
}
//! Entry point: parses the command line, then builds the engine and runs one inference.
int main(int argc, char** argv)
{
    samplesCommon::Args args;
    // Reject a malformed command line and show the usage text.
    if (!samplesCommon::parseArgs(args, argc, argv))
    {
        gLogError << "Invalid arguments" << std::endl;
        printHelpInfo();
        return EXIT_FAILURE;
    }

    // An explicit help request is not an error.
    if (args.help)
    {
        printHelpInfo();
        return EXIT_SUCCESS;
    }

    auto sampleTest = gLogger.defineTest(gSampleName, argc, argv);
    gLogger.reportTestStart(sampleTest);

    // Turn the parsed arguments into OnnxSampleParams and construct the sample from them.
    SampleOnnxMNIST sample(initializeSampleParams(args));

    gLogInfo << "Building and running a GPU inference engine for Onnx MNIST" << std::endl;

    // infer() only runs when build() succeeded (short-circuit evaluation).
    const bool succeeded = sample.build() && sample.infer();
    return succeeded ? gLogger.reportPass(sampleTest) : gLogger.reportFail(sampleTest);
}
这里的 gLogger.defineTest 定义的应该是一个测试单元,与推理并无直接关系。initializeSampleParams(args) 用来由命令行参数构建 samplesCommon::OnnxSampleParams 类型的参数:
//! Builds the sample parameters from the parsed command-line arguments.
//! @param args parsed command-line arguments
//! @return parameters for the ONNX MNIST sample
samplesCommon::OnnxSampleParams initializeSampleParams(const samplesCommon::Args& args)
{
    samplesCommon::OnnxSampleParams params;

    // Directories searched for data files: the user's list, or the defaults when none was given.
    params.dataDirs = args.dataDirs;
    if (params.dataDirs.empty())
    {
        params.dataDirs = {"data/mnist/", "data/samples/mnist/"};
    }

    // Model file and the names of its input/output tensors.
    params.onnxFileName = "mnist.onnx";
    params.inputTensorNames.push_back("Input3");
    params.outputTensorNames.push_back("Plus214_Output_0");
    params.batchSize = 1;

    // Hardware / precision options taken from the command line.
    params.dlaCore = args.useDLACore;
    params.int8 = args.runInInt8;
    params.fp16 = args.runInFp16;

    return params;
}
接下来我们深入到 SampleOnnxMNIST 类中各方法的具体定义。
1. 首先是创建 cuda 引擎的部分,即 build 方法,最好和上一篇中 python API 的示例结合起来看:
//! Creates the builder, network, builder config and ONNX parser, fills the
//! network from the ONNX file and builds the engine into mEngine.
//! @return true when the engine was built, false if any step failed
bool SampleOnnxMNIST::build()
{
    // The builder is created from the logger.
    auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(gLogger.getTRTLogger()));
    if (!builder)
    {
        return false;
    }

    // The network is created with the explicit-batch flag, as in the Python API example.
    const auto explicitBatch
        = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
    if (!network)
    {
        return false;
    }

    // Unlike the Python API example, an IBuilderConfig is also needed here.
    auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if (!config)
    {
        return false;
    }

    auto parser
        = SampleUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, gLogger.getTRTLogger()));
    if (!parser)
    {
        return false;
    }

    // Fill the network from the ONNX model (constructNetwork is defined by this sample).
    auto constructed = constructNetwork(builder, network, config, parser);
    if (!constructed)
    {
        return false;
    }

    // Build the engine from the populated network.
    mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
        builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter());
    if (!mEngine)
    {
        return false;
    }

    // This sample expects exactly one 4-D input and one 2-D output.
    assert(network->getNbInputs() == 1);
    mInputDims = network->getInput(0)->getDimensions();
    assert(mInputDims.nbDims == 4);

    assert(network->getNbOutputs() == 1);
    mOutputDims = network->getOutput(0)->getDimensions();
    assert(mOutputDims.nbDims == 2);

    return true;
}
其中填充 计算图 的代码如下:
//! Parses the ONNX model into the network and applies the build settings.
//! @param builder builder used for the max batch size and DLA settings
//! @param network network definition (receives tensor scales in Int8 mode)
//! @param config  builder configuration (workspace size, precision flags)
//! @param parser  ONNX parser created for the network
//! @return false when the ONNX file could not be parsed, true otherwise
bool SampleOnnxMNIST::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
    SampleUniquePtr<nvinfer1::INetworkDefinition>& network,
    SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
    SampleUniquePtr<nvonnxparser::IParser>& parser)
{
    // locateFile() looks the model up in the given list of data directories.
    const std::string onnxPath = locateFile(mParams.onnxFileName, mParams.dataDirs);
    const int verbosity = static_cast<int>(gLogger.getReportableSeverity());
    if (!parser->parseFromFile(onnxPath.c_str(), verbosity))
    {
        return false;
    }

    builder->setMaxBatchSize(mParams.batchSize); // maximum batch size
    config->setMaxWorkspaceSize(16_MiB);

    if (mParams.fp16)
    {
        config->setFlag(BuilderFlag::kFP16);
    }

    if (mParams.int8)
    {
        config->setFlag(BuilderFlag::kINT8);
        samplesCommon::setAllTensorScales(network.get(), 127.0f, 127.0f);
    }

    samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore);

    return true;
}
2. 其次是推理阶段,即 infer 方法,包括数据预处理、前向推理和后处理:
//! Runs one inference: allocates buffers, loads the input, executes the
//! engine and verifies the result.
//! @return true when every step succeeded and the output was verified
bool SampleOnnxMNIST::infer()
{
    // Create RAII buffer manager object (declared in samples/common/buffers.h).
    samplesCommon::BufferManager buffers(mEngine, mParams.batchSize);

    // Create the execution context for inference.
    auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
    if (!context)
    {
        return false;
    }

    // Read the input data into the managed buffers.
    assert(mParams.inputTensorNames.size() == 1);
    if (!processInput(buffers))
    {
        return false;
    }

    // Memcpy from host input buffers to device input buffers.
    buffers.copyInputToDevice();

    // Run inference.
    bool status = context->executeV2(buffers.getDeviceBindings().data());
    if (!status)
    {
        return false;
    }

    // Memcpy from device output buffers to host output buffers.
    buffers.copyOutputToHost();

    // Verify results.
    if (!verifyOutput(buffers))
    {
        return false;
    }

    return true;
}
3. 最后是输入和输出的处理。processInput 方法主要完成输入的预处理,并将结果拷贝到指定的缓存中;verifyOutput 方法则验证分类结果是否正确。
//! Loads a randomly chosen digit image, logs it as ASCII art and writes the
//! normalised pixels into the host input buffer.
//! @param buffers buffer manager that provides the host input buffer
//! @return always true
bool SampleOnnxMNIST::processInput(const samplesCommon::BufferManager& buffers)
{
    const int height = mInputDims.d[2];
    const int width = mInputDims.d[3];
    const int pixelCount = height * width;

    // Pick a random digit; the generator is seeded with the current time.
    srand(unsigned(time(nullptr)));
    std::vector<uint8_t> pixels(pixelCount);
    mNumber = rand() % 10;

    // readPGMFile (samples/common/common.h) is given the buffer and the image size.
    readPGMFile(locateFile(std::to_string(mNumber) + ".pgm", mParams.dataDirs), pixels.data(), height, width);

    // Print an ascii representation, breaking the line after every `width` pixels.
    gLogInfo << "Input:" << std::endl;
    for (int p = 0; p < pixelCount; ++p)
    {
        const char shade = " .:-=+*#%@"[pixels[p] / 26];
        const char* lineBreak = ((p + 1) % width) ? "" : "\n";
        gLogInfo << shade << lineBreak;
    }
    gLogInfo << std::endl;

    // Scale each pixel to [0, 1] and invert it before handing it to the network.
    float* hostInput = static_cast<float*>(buffers.getHostBuffer(mParams.inputTensorNames[0]));
    for (int p = 0; p < pixelCount; ++p)
    {
        hostInput[p] = 1.0 - float(pixels[p] / 255.0);
    }

    return true;
}
//! Applies softmax to the network output in place, logs every class
//! probability and checks the prediction.
//! @param buffers buffer manager that provides the host output buffer
//! @return true when the most probable class equals mNumber with probability above 0.9
bool SampleOnnxMNIST::verifyOutput(const samplesCommon::BufferManager& buffers)
{
    const int classCount = mOutputDims.d[1];
    float* scores = static_cast<float*>(buffers.getHostBuffer(mParams.outputTensorNames[0]));

    // Softmax, first pass: exponentiate in place and accumulate the normaliser.
    float total{0.0f};
    for (int c = 0; c < classCount; ++c)
    {
        scores[c] = exp(scores[c]);
        total += scores[c];
    }

    gLogInfo << "Output:" << std::endl;

    // Softmax, second pass: normalise, track the best class and log a small bar chart.
    float best{0.0f};
    int bestClass{0};
    for (int c = 0; c < classCount; ++c)
    {
        scores[c] /= total;
        // `>=` keeps the last class among equal maxima.
        if (scores[c] >= best)
        {
            best = scores[c];
            bestClass = c;
        }
        const int stars = int(std::floor(scores[c] * 10 + 0.5f));
        gLogInfo << " Prob " << c << " " << std::fixed << std::setw(5) << std::setprecision(4) << scores[c] << " "
                 << "Class " << c << ": " << std::string(stars, '*') << std::endl;
    }
    gLogInfo << std::endl;

    return bestClass == mNumber && best > 0.9f;
}
- 编写
CMakeLists.txt
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(FP_TEST)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})

# cuda_add_executable() below is provided by FindCUDA, so CUDA is mandatory.
find_package(CUDA REQUIRED)
set(
    CUDA_NVCC_FLAGS
    ${CUDA_NVCC_FLAGS}
    # Optimisation level. Lower-case "-o" would instead name the output file.
    -O3
    -gencode arch=compute_70,code=sm_70
)

find_package(Protobuf)
if(PROTOBUF_FOUND)
    message(STATUS " version: ${Protobuf_VERSION}")
    message(STATUS " libraries: ${PROTOBUF_LIBRARIES}")
    message(STATUS " include path: ${PROTOBUF_INCLUDE_DIR}")
else()
    message(WARNING "Protobuf not found, onnx model convert tool won't be built")
endif()

# TensorRT: locate the headers and the nvinfer / nvinfer_plugin libraries.
set(TENSORRT_ROOT /home/zwzhou/packages/TensorRT-7.0.0.11)
find_path(TENSORRT_INCLUDE_DIR NvInfer.h
    HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR}
    PATH_SUFFIXES include)
MESSAGE(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}")
find_library(TENSORRT_LIBRARY_INFER nvinfer
    HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
    PATH_SUFFIXES lib lib64 lib/x64)
find_library(TENSORRT_LIBRARY_INFER_PLUGIN nvinfer_plugin
    HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
    PATH_SUFFIXES lib lib64 lib/x64)
set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_INFER_PLUGIN})
MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")

# Include the module explicitly instead of relying on another find module to pull it in.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
    TENSORRT DEFAULT_MSG TENSORRT_INCLUDE_DIR TENSORRT_LIBRARY)
if(NOT TENSORRT_FOUND)
    # FATAL_ERROR stops configuration; "ERROR" is not a message mode.
    message(FATAL_ERROR
        "Cannot find TensorRT library.")
endif()

# The ONNX parser library and the samples' common headers live under TENSORRT_ROOT.
LINK_LIBRARIES("${TENSORRT_ROOT}/lib/libnvonnxparser.so")
LINK_LIBRARIES("${TENSORRT_ROOT}/lib/libnvinfer.so")
INCLUDE_DIRECTORIES("${TENSORRT_ROOT}/samples/common")

# opencv
set(OpenCV_DIR /home/zwzhou/opencv4/lib/cmake/opencv4/)
find_package(OpenCV REQUIRED)
# OpenCV_INCLUDE_DIRS holds the OpenCV header directories.
include_directories(${OpenCV_INCLUDE_DIRS})
set(OpenCV_LIBS opencv_core opencv_imgproc opencv_objdetect )

##############################################
# set(gLogger /home/zwzhou/packages/TensorRT-7.0.0.11/samples/common/logger.cpp)
##############################################
cuda_add_executable(mtest ./onnx2trt_test.cpp ${gLogger})
target_include_directories(mtest PUBLIC ${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR})
target_link_libraries(mtest ${CUDA_LIBRARIES} ${OpenCV_LIBS} ${TENSORRT_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cudart_static_LIBRARY})
- 遇到的问题:
- 找不到 各种后缀 cuda.9.0的动态链接库
解决办法: 怀疑可能是因为所用环境下是cuda 9.2, 但安装的TensorRT对应的是cuda9.0. 于是在自己的目录下重新安装了 cuda9.0, 安装过程参考:非 root权限安装 cuda和cudnn。
安装之后设置环境变量.bashrc
:
# CUDA9.0
# Root of the user-local CUDA 9.0 installation.
export CUDA_HOME=/home/zwzhou/cuda-9.0
# Appends CUDA 9.0's bin directory to PATH, i.e. after any entries already on PATH.
export PATH=$PATH:$CUDA_HOME/bin
# export PATH=$CUDA_HOME/bin:$PATH
#export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
# Add the CUDA 9.0 and TensorRT library directories to the runtime library search path.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zwzhou/cuda-9.0/lib64
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zwzhou/packages/TensorRT-7.0.0.11/lib
# Prepend the OpenCV 4 pkg-config directory.
export PKG_CONFIG_PATH=/home/zwzhou/opencv4/lib/pkgconfig:$PKG_CONFIG_PATH
执行 source ~/.bashrc 之后,发现 nvcc -V 仍然显示 cuda 9.2。此时用 echo $PATH 查看,输出为:
/home/zwzhou/bin:/home/zwzhou/.local/bin:/home/zwzhou/anaconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/zwzhou/.dotnet/tools:/usr/local/cuda-9.2/bin:/home/zwzhou/cuda-9.0/bin:/home/zwzhou/cuda-9.0/bin:/usr/local/cuda/bin
即先采用的是cuda 9.2的nvcc和路径,所以修改PATH变量导出为: export PATH=$CUDA_HOME/bin:$PATH
再次 nvcc -V
显示 cuda 9.0 版本。
- 找不到 gLogger 以及对应的各种函数
解决办法:将 samples/common 下的 logger.cpp 文件加入到编译依赖中,即在 CMakeLists.txt 中修改如下:
# Compile the samples' logger.cpp into the executable together with the test source.
set(gLogger /home/zwzhou/packages/TensorRT-7.0.0.11/samples/common/logger.cpp)
cuda_add_executable(mtest ./onnx2trt_test.cpp ${gLogger})
- 找不到 *.pgm 文件,即找不到 MNIST 的图像数据
解决办法:执行 TensorRT-7.0.0.11/data/mnist/download_pgms.py,会下载并解压 10 个 pgm 文件到对应文件夹。
- 尝试bs>1的情形。
- maxBatchSize>1,但输入batchsize=1时依然能正确运行。
- 大batchsize的输入输出。修改部分如下:
constexpr int BATCHSIZE{2}; // Global: number of samples per batch
...
//! Batched input: loads BATCHSIZE randomly chosen digit images back to back,
//! logs them as ASCII art and writes the normalised pixels into the host input buffer.
//! @param buffers buffer manager that provides the host input buffer
//! @return always true
bool SampleOnnxMNIST::processInput(const samplesCommon::BufferManager& buffers)
{
    const int height = mInputDims.d[2];
    const int width = mInputDims.d[3];
    const int imageSize = height * width;
    int batch_size = BATCHSIZE;
    const int totalPixels = batch_size * imageSize;

    srand(unsigned(time(nullptr)));
    std::vector<uint8_t> pixels(totalPixels);

    // Read one image per batch slot; mNumber ends up holding the digit of the last one.
    for (int slot = 0; slot < batch_size; ++slot)
    {
        mNumber = rand() % 10;
        const std::string fileName = std::to_string(mNumber) + ".pgm";
        readPGMFile(locateFile(fileName, mParams.dataDirs), pixels.data() + slot * imageSize, height, width);
        std::cout << fileName << "\n";
    }

    // Print an ascii representation, breaking the line after every `width` pixels.
    gLogInfo << "Input:" << std::endl;
    for (int p = 0; p < totalPixels; ++p)
    {
        const char shade = " .:-=+*#%@"[pixels[p] / 26];
        const char* lineBreak = ((p + 1) % width) ? "" : "\n";
        gLogInfo << shade << lineBreak;
    }
    gLogInfo << std::endl;

    // Scale each pixel to [0, 1] and invert it before handing it to the network.
    float* hostInput = static_cast<float*>(buffers.getHostBuffer(mParams.inputTensorNames[0]));
    for (int p = 0; p < totalPixels; ++p)
    {
        hostInput[p] = 1.0 - float(pixels[p] / 255.0);
    }

    return true;
}
//! Batched output: applies softmax to each of the BATCHSIZE samples separately
//! (in place), logs every class probability and checks the prediction.
//!
//! The softmax sum, the best probability and the best class are reset for every
//! sample. Class indices are relative to the sample (0 .. outputSize-1), so they
//! can be compared with a digit label.
//!
//! @param buffers buffer manager that provides the host output buffer
//! @return true when the last sample's most probable class equals mNumber with
//!         probability above 0.9. mNumber is a single int, so only one label is
//!         available for the comparison.
bool SampleOnnxMNIST::verifyOutput(const samplesCommon::BufferManager& buffers)
{
    const int outputSize = mOutputDims.d[1];
    float* output = static_cast<float*>(buffers.getHostBuffer(mParams.outputTensorNames[0]));
    float val{0.0f};
    int idx{0};

    for (int b = 0; b < BATCHSIZE; ++b)
    {
        // Start of this sample's scores inside the flat output buffer.
        float* sample = output + b * outputSize;

        // Per-sample state: must not carry over from the previous sample.
        float sum{0.0f};
        val = 0.0f;
        idx = 0;

        // Calculate Softmax
        for (int i = 0; i < outputSize; i++)
        {
            sample[i] = exp(sample[i]);
            sum += sample[i];
        }

        gLogInfo << "Output:" << std::endl;
        for (int i = 0; i < outputSize; i++)
        {
            sample[i] /= sum;
            val = std::max(val, sample[i]);
            if (val == sample[i])
            {
                idx = i;
            }
            gLogInfo << " Prob " << i << " " << std::fixed << std::setw(5) << std::setprecision(4) << sample[i] << " "
                     << "Class " << i << ": " << std::string(int(std::floor(sample[i] * 10 + 0.5f)), '*') << std::endl;
        }
        gLogInfo << std::endl;
    }

    return idx == mNumber && val > 0.9f;
}
// Excerpt: only the changed line is shown; "..." stands for the elided, unchanged code.
samplesCommon::OnnxSampleParams initializeSampleParams(const samplesCommon::Args& args)
{ // Set the maximum batch size
...
params.batchSize = BATCHSIZE;
...
return params;
}
输出结果为:
发现TensorRT对于ONNX的大batchsize的支持还是和python API相同的问题,因为onnx存储时bs=1,所以只有第一个sample输出是正确的,其余的输出都为0.
参考:
nvcc定位不到的问题
gLogger找不到问题
非root权限安装多版本cuda和cudnn
利用TensorRT对深度学习进行加速
利用TensorRT实现神经网络提速(读取onnx模型并运行
Nvidia/TensorRT doc
动态batchsize