In a recent project I needed to speed up TensorFlow code running on a Xavier. Since the Xavier ships with TensorRT, I used TensorRT directly for the acceleration. This post is written for TensorFlow; other frameworks require the corresponding changes.
Introduction to TensorRT
TensorRT is NVIDIA's neural-network inference engine built on CUDA and cuDNN. Compared with a general-purpose deep learning framework running in CPU or GPU mode, it can deliver a 10x or even 100x speedup, which greatly improves the inference speed of deep learning models on edge devices.
TensorRT speeds up inference mainly in two ways: it fuses and merges layers of the model, and it lowers the arithmetic precision, e.g. running in FP16 or INT8.
The Xavier platform comes with the C++ version of TensorRT pre-installed, so this post does not cover installing TensorRT.
The overall workflow for using TensorRT is:
- Train the model on the PC and export a pb model file
- On the PC, convert the model file into a format TensorRT can consume; for TensorFlow, convert the pb file to the uff format
- On the Xavier, build an engine from the converted model file
- On the Xavier, run inference with the engine
Exporting the pb model file
Train the model on the PC as usual and freeze it into a single pb file (a graph with the weights folded into constants); how you do this depends on your training code, so it is not covered here.
Converting the pb file to a uff file
See my other article on installing TensorRT 5.1 on Ubuntu 16.04. Once TensorRT is installed, the convert-to-uff command converts the model; -o sets the name of the converted model:
convert-to-uff model.pb -o model.uff
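If the converter cannot determine the output node by itself, it has to be told explicitly. As far as I know, the TensorRT 5.x converter also accepts -l to list the nodes in the graph and -O to name the output node; the node name below is a placeholder:
convert-to-uff model.pb -l
convert-to-uff model.pb -o model.uff -O output_node_name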
Building the engine on the Xavier
For me the hardest part of this step was writing the CMakeLists.txt file: it has to pull in the CUDA libraries as well as the TensorRT headers and libraries. My CMakeLists.txt is shown below, and I use this same file for both this step and the next one.
cmake_minimum_required(VERSION 2.8)
project(tensorrt)
find_package(OpenCV REQUIRED)
# CUDA headers
include_directories(/usr/local/cuda/include)
# TensorRT libraries
link_libraries("/usr/lib/aarch64-linux-gnu/libnvparsers.so")
link_libraries("/usr/lib/aarch64-linux-gnu/libnvinfer.so")
# CUDA runtime library
link_libraries("/usr/local/cuda/lib64/libcudart.so")
# the two executables: inference (tensorrt.cpp) and engine building (uff_to_plan.cpp);
# the link_libraries calls above apply to both of them
add_executable(tensorrt tensorrt.cpp)
add_executable(uff_to_plan uff_to_plan.cpp)
# OpenCV is only needed by the inference executable
target_link_libraries(tensorrt ${OpenCV_LIBS})
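With this CMakeLists.txt placed next to tensorrt.cpp and uff_to_plan.cpp, the usual out-of-source CMake build produces the two executables:
mkdir build && cd build
cmake ..
make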
The code below (uff_to_plan.cpp) builds the engine and serializes it to a file. The model file names, node names, and input size at the top are placeholders; replace them with your own.
One thing to watch out for: TensorFlow feeds tensors as (height, width, channel), i.e. HWC, whereas TensorRT works with CHW. The layout change is handled automatically when the model is converted, so nothing needs to be done here, but at inference time you must rearrange the input image to CHW yourself. Concretely, the pixel at (row i, column j, channel k) sits at offset i*W*C + j*C + k in HWC but at k*H*W + i*W + j in CHW; the cvImageToTensor helper in the inference code below performs exactly this remapping.
#include <fstream>
#include <iostream>
#include <string>
#include <NvInfer.h>
#include <NvUffParser.h>

using namespace std;
using namespace nvinfer1;
using namespace nvuffparser;

/* model parameters -- placeholders, adjust to your own network */
static const string modelName = "model.uff";
static const string planFilename = "model.plan";
static const string inputName = "input";    // name of the input node in the graph
static const string outputName = "output";  // name of the output node in the graph
static const int inputHeight = 256;
static const int inputWidth = 256;
static const int maxBatchSize = 1;
static const size_t maxWorkspaceSize = 1 << 30;

class Logger : public ILogger
{
    void log(Severity severity, const char *msg) override
    {
        cout << msg << endl;
    }
} gLogger;

int main(int argc, char *argv[])
{
    /* parse the uff model */
    IBuilder *builder = createInferBuilder(gLogger);
    INetworkDefinition *network = builder->createNetwork();
    IUffParser *parser = createUffParser();
    /* register input and output */
    parser->registerInput(inputName.c_str(), DimsCHW(3, inputHeight, inputWidth), UffInputOrder::kNCHW);
    parser->registerOutput(outputName.c_str());
    if (!parser->parse(modelName.c_str(), *network, DataType::kFLOAT))
    {
        cout << "Failed to parse UFF\n";
        builder->destroy();
        parser->destroy();
        network->destroy();
        return 1;
    }
    /* build the engine */
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(maxWorkspaceSize);
    /* use FP16 */
    builder->setFp16Mode(true);
    // builder->setInt8Mode(true);  // INT8 additionally requires a calibrator
    ICudaEngine *engine = builder->buildCudaEngine(*network);
    if (engine == nullptr)
    {
        cout << "Failed to build engine\n";
        return 1;
    }
    /* serialize the engine and write it to file */
    ofstream planFile(planFilename, ios::binary);
    IHostMemory *serializedEngine = engine->serialize();
    planFile.write((char *)serializedEngine->data(), serializedEngine->size());
    planFile.close();
    /* clean up */
    serializedEngine->destroy();
    builder->destroy();
    parser->destroy();
    network->destroy();
    engine->destroy();
    return 0;
}
Running inference with TensorRT
The previous step built the engine, serialized it, and saved it as a plan file. This step (tensorrt.cpp) reads the plan file back, deserializes it to reconstruct the engine, and then runs inference with that engine. As above, the file and node names at the top are placeholders.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>
#include <NvInfer.h>
#include <cuda_runtime_api.h>

using namespace std;
using namespace nvinfer1;

/* placeholders -- use the same node names as when the engine was built */
static const string planFileName = "model.plan";
static const string inputName = "input";
static const string outputName = "output";
static const string imageFolderName = "images/";  // must end with '/'
static const int kBatchSize = 1;

class Logger : public ILogger
{
    void log(Severity severity, const char *msg) override
    {
        cout << msg << endl;
    }
} gLogger;

/* copy an 8-bit HWC OpenCV image into a float CHW tensor */
void cvImageToTensor(const cv::Mat &image, float *tensor, nvinfer1::Dims dimensions)
{
    const size_t channels = dimensions.d[0];
    const size_t height = dimensions.d[1];
    const size_t width = dimensions.d[2];
    // TODO: validate dimensions match
    const size_t stridesCv[3] = {width * channels, channels, 1};
    const size_t strides[3] = {height * width, width, 1};
    for (size_t i = 0; i < height; i++)
    {
        for (size_t j = 0; j < width; j++)
        {
            for (size_t k = 0; k < channels; k++)
            {
                const size_t offsetCv = i * stridesCv[0] + j * stridesCv[1] + k * stridesCv[2];
                const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
                tensor[offset] = (float)image.data[offsetCv];
            }
        }
    }
}
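// The solve() function below also calls three helpers that the original post does not show:
// numTensorElements, getImages, and TensorToImage. The sketches below are my own minimal
// assumptions, not the author's code: getImages lists the files in the image folder, and
// TensorToImage assumes the network outputs a single-channel CHW map with values in [0, 1].
size_t numTensorElements(nvinfer1::Dims dims)
{
    size_t count = dims.nbDims > 0 ? 1 : 0;
    for (int i = 0; i < dims.nbDims; i++)
        count *= dims.d[i];
    return count;
}

void getImages(const string &folder, vector<string> &names)
{
    vector<cv::String> paths;
    cv::glob(folder + "*", paths);
    for (size_t i = 0; i < paths.size(); i++)
    {
        string p(paths[i].c_str());
        names.push_back(p.substr(p.find_last_of('/') + 1));  // keep only the file name
    }
}

void TensorToImage(const float *tensor, cv::Mat &image, nvinfer1::Dims dims)
{
    const size_t height = dims.d[1];
    const size_t width = dims.d[2];
    for (size_t i = 0; i < height; i++)
        for (size_t j = 0; j < width; j++)
            image.at<uchar>(i, j) = (uchar)(tensor[i * width + j] * 255.0f);
}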
void solve()
{
    /* read the plan file and deserialize it to rebuild the engine */
    ifstream planFile(planFileName, ios::binary);
    stringstream planBuffer;
    planBuffer << planFile.rdbuf();
    string plan = planBuffer.str();
    IRuntime *runtime = createInferRuntime(gLogger);
    ICudaEngine *engine = runtime->deserializeCudaEngine((void *)plan.data(), plan.size(), nullptr);
    IExecutionContext *context = engine->createExecutionContext();
    /* get the input and output binding indices and dimensions */
    int inputBindingIndex, outputBindingIndex;
    inputBindingIndex = engine->getBindingIndex(inputName.c_str());
    outputBindingIndex = engine->getBindingIndex(outputName.c_str());
    Dims inputDims, outputDims;
    inputDims = engine->getBindingDimensions(inputBindingIndex);
    outputDims = engine->getBindingDimensions(outputBindingIndex);
    /* input and output sizes (binding dimensions are CHW) */
    int inputWidth, inputHeight, outputHeight, outputWidth;
    inputHeight = inputDims.d[1];
    inputWidth = inputDims.d[2];
    outputHeight = outputDims.d[1];
    outputWidth = outputDims.d[2];
    /* allocate host buffers for the input and output tensors */
    float *inputDataHost, *outputDataHost;
    size_t numInput, numOutput;
    numInput = numTensorElements(inputDims);
    numOutput = numTensorElements(outputDims);
    inputDataHost = (float *)malloc(numInput * sizeof(float));
    outputDataHost = (float *)malloc(numOutput * sizeof(float));
    /* allocate the corresponding device buffers */
    void *inputDataDevice = nullptr, *outputDataDevice = nullptr;
    cudaMalloc(&inputDataDevice, numInput * sizeof(float));
    cudaMalloc(&outputDataDevice, numOutput * sizeof(float));
    if (inputDataDevice == nullptr || outputDataDevice == nullptr)
    {
        cerr << "Out of memory" << endl;
        exit(1);
    }
    void *bindings[2];
    bindings[inputBindingIndex] = inputDataDevice;
    bindings[outputBindingIndex] = outputDataDevice;
    /* collect the image file names */
    vector<string> images;
    getImages(imageFolderName, images);
    cout << "Executing inference engine..." << endl;
    for (size_t i = 0; i < images.size(); ++i)
    {
        string imageFileName = images[i];
        cv::Mat image = cv::imread(imageFolderName + imageFileName);
        /* BGR to RGB */
        cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
        /* resize to the network input size */
        cv::resize(image, image, cv::Size(inputWidth, inputHeight));
        /* convert HWC uint8 to float CHW */
        cvImageToTensor(image, inputDataHost, inputDims);
        cudaMemcpy(inputDataDevice, inputDataHost, numInput * sizeof(float), cudaMemcpyHostToDevice);
        /* execute the engine */
        context->execute(kBatchSize, bindings);
        /* transfer the output back to the host */
        cudaMemcpy(outputDataHost, outputDataDevice, numOutput * sizeof(float), cudaMemcpyDeviceToHost);
        cv::Mat preds(outputHeight, outputWidth, CV_8UC1);
        TensorToImage(outputDataHost, preds, outputDims);
        // post-processing omitted
        cout << i << "/" << images.size() << " is over" << endl;
    }
    /* clean up (destroy the context before the engine) */
    context->destroy();
    engine->destroy();
    runtime->destroy();
    free(inputDataHost);
    free(outputDataHost);
    cudaFree(inputDataDevice);
    cudaFree(outputDataDevice);
}
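The listing above is only the core of tensorrt.cpp; a minimal entry point that simply calls solve() is enough to run it:
int main()
{
    solve();
    return 0;
}
If you want to check how much FP16 actually helps, the simplest way is to wrap the context->execute() call with std::chrono timers and compare the per-image latency against the original TensorFlow runtime.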
Reference code
- https://github.com/NVIDIA-AI-IOT/tf_to_trt_image_classification
- The TensorRT samples bundled with the Xavier, in /usr/src/tensorrt/examples