1. Overview
In my tests, SwinTransformer really is a score booster and a leaderboard secret weapon: fine-tuning a downstream task with SwinTransformer as the backbone conservatively brings a 2 to 5 point gain over ResNet50, although the parameter count does grow. I benchmarked inference speed with OnnxRuntime in CPU mode and in GPU mode (without TensorRT). For most image recognition tasks, this speed is acceptable.
Mode | Hardware | Input | Average latency (ms) |
---|---|---|---|
CPU | Intel(R) Xeon(R) W-2102 CPU @ 2.90GHz | 224*224 | |
GPU | Nvidia Tesla T4 | 224*224 | |
2. Environment
- ubuntu 18.04
- pytorch 1.10 (lower versions are not supported)
- onnxruntime 1.10
- cuda 11.4
- cudnn 8.2.4 (CUDA must be 11.4 or newer, with the matching cuDNN)
- opencv 4.4
The correspondence between onnxruntime and CUDA versions is shown below:
[Figure: CUDA / onnxruntime version compatibility table]
Onnxruntime download address
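Before going further, it is worth sanity-checking the installation from Python (a minimal sketch; it assumes torch and the GPU build of onnxruntime from the list above are installed):

import torch
import onnxruntime

print(torch.__version__, torch.version.cuda)   # expect 1.10.x and 11.x
print(onnxruntime.__version__)                 # expect 1.10.x
# 'CUDAExecutionProvider' must appear here for the GPU path to work
print(onnxruntime.get_available_providers())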
3. Converting the model to ONNX
Strip the classification head used during training and export only the feature extractor of the timm version of SwinTransformer.
The converted model can be downloaded here:
Link: https://pan.baidu.com/s/1oKUrPxPtYUFGVXJ2SiBP3g
Extraction code: czfj
import timm
import torch
import torch.nn as nn

class ft_net_swin_extract(nn.Module):
    def __init__(self, class_num, droprate=0.5, stride=2):
        super(ft_net_swin_extract, self).__init__()
        model_ft = timm.create_model('swin_base_patch4_window7_224', pretrained=True)
        # Replace the classification head with an identity so the model
        # outputs features only (this also saves memory)
        model_ft.head = nn.Sequential()
        self.model = model_ft

    def forward(self, x):
        # forward_features returns the backbone features, skipping the head
        x = self.model.forward_features(x)
        return x

Save this class as model.py; the export script below imports it from there.
from model import ft_net_swin_extract
import numpy as np
import torch
from torchvision import transforms
import cv2
import onnx
import onnxruntime

data_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

image_file = "000_000_gan0002_c3s1_136308_04.jpg"
input = cv2.imread(image_file)  # BGR; the C++ code later preprocesses the same way
img_h, img_w, _ = input.shape
resize_input = cv2.resize(input, (224, 224))
image = data_transforms(resize_input)
image = image.unsqueeze(0)  # add a batch dimension: (1, 3, 224, 224)
print(image.shape)

model = ft_net_swin_extract(class_num=751)
model_path = "/home/nemo/DeepLearning/Person_reID_baseline_pytorch/MarketOut/best.pth"
# strict=False skips the classifier weights that the training checkpoint
# contains but this feature-extraction model no longer has
model.load_state_dict(torch.load(model_path), strict=False)
model.eval()
torch_out = model(image)
# Export the model
torch.onnx.export(model,                     # model being run
                  image,                     # model input (or a tuple for multiple inputs)
                  "swin-transform.onnx",     # where to save the model
                  export_params=True,        # store the trained weights inside the model file
                  opset_version=12,          # the ONNX opset version to export to
                  do_constant_folding=True,  # execute constant folding for optimization
                  input_names=['input'],     # the model's input names
                  output_names=['output'])   # the model's output names
onnx_model = onnx.load("swin-transform.onnx")
onnx.checker.check_model(onnx_model)
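The script imports onnxruntime but never actually uses it; a natural final step is to run the exported model through onnxruntime and compare against the PyTorch output. A minimal sketch, appended to the script above:

ort_session = onnxruntime.InferenceSession("swin-transform.onnx",
                                           providers=["CPUExecutionProvider"])
ort_inputs = {ort_session.get_inputs()[0].name: image.numpy()}
ort_out = ort_session.run(None, ort_inputs)[0]
# the two feature vectors should agree up to numerical noise
np.testing.assert_allclose(torch_out.detach().numpy(), ort_out, rtol=1e-3, atol=1e-5)
print("onnxruntime output matches PyTorch")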
4. Writing the onnxruntime inference code
The code is available at:
https://gitee.com/running_jiang/swintransformer-onnxruntime.git
https://github.com/runningJ/swintransformer-onnxruntime.git
Stars are welcome, no freeloading.
CPU version
#include <iostream>
#include <vector>
#include <numeric>
#include <chrono>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#include <onnxruntime_cxx_api.h>

using namespace std;
using namespace cv;
using namespace Ort;

// Product of all dimensions, i.e. the tensor's element count
template <typename T>
T vectorProduct(const std::vector<T>& v)
{
    return accumulate(v.begin(), v.end(), static_cast<T>(1), std::multiplies<T>());
}

int main(int argc, char** argv)
{
    if (argc != 3)
    {
        cerr << "usage: " << argv[0] << " image_path model_path" << endl;
        return 1;
    }
    cv::Mat image = imread(argv[1]);
    if (image.empty())
    {
        cerr << "failed to read input image " << argv[1] << endl;
        return 1;
    }
    string model_path = argv[2];

    Env env;
    SessionOptions options;  // default-constructed options; SessionOptions{nullptr} is a null handle
    Session session(env, model_path.c_str(), options);

    size_t numInputNodes = session.GetInputCount();
    size_t numOutputNodes = session.GetOutputCount();
    cout << "Number of Input Nodes: " << numInputNodes << endl;
    cout << "Number of Output Nodes: " << numOutputNodes << endl;

    AllocatorWithDefaultOptions allocator;
    const char* inputName = session.GetInputName(0, allocator);
    cout << "Input Name: " << inputName << endl;
    TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
    auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> inputDims = inputTensorInfo.GetShape();  // NCHW
    cout << "Input Dimensions: ";
    for (size_t i = 0; i < inputDims.size(); ++i)
    {
        cout << inputDims[i] << " ";
    }
    cout << endl;
    cout << "-----------------------------------------" << endl;

    const char* outputName = session.GetOutputName(0, allocator);
    cout << "Output Name: " << outputName << endl;
    TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
    auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
    cout << "Output Dimensions: ";
    for (size_t i = 0; i < outputDims.size(); ++i)
    {
        cout << outputDims[i] << " ";
    }
    cout << endl;

    // Data preprocessing: must match the Python export script exactly
    cv::Mat resizedImageBGR, resizedImage, preprocessedImage;
    // cv::Size is (width, height); the dims are NCHW, so width = at(3), height = at(2)
    cv::resize(image, resizedImageBGR, cv::Size(inputDims.at(3), inputDims.at(2)));
    // The export script fed BGR straight into the model, so no BGR->RGB swap here either
    resizedImageBGR.convertTo(resizedImage, CV_32F, 1.0 / 255);
    cv::Mat channels[3];
    cv::split(resizedImage, channels);
    channels[0] = (channels[0] - 0.485) / 0.229;
    channels[1] = (channels[1] - 0.456) / 0.224;
    channels[2] = (channels[2] - 0.406) / 0.225;
    cv::merge(channels, 3, resizedImage);
    cv::dnn::blobFromImage(resizedImage, preprocessedImage);  // HWC -> NCHW

    size_t inputTensorSize = vectorProduct(inputDims);
    std::vector<float> inputTensorValues(inputTensorSize);
    inputTensorValues.assign(preprocessedImage.begin<float>(),
                             preprocessedImage.end<float>());
    size_t outputTensorSize = vectorProduct(outputDims);
    std::vector<float> outputTensorValues(outputTensorSize);

    std::vector<const char*> inputNames{inputName};
    std::vector<const char*> outputNames{outputName};
    std::vector<Value> inputTensors;
    std::vector<Value> outputTensors;
    MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
    inputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, inputTensorValues.data(), inputTensorSize,
        inputDims.data(), inputDims.size()));
    outputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, outputTensorValues.data(), outputTensorSize,
        outputDims.data(), outputDims.size()));

    for (int i = 0; i < 100; ++i)
    {
        auto s_t = std::chrono::steady_clock::now();
        session.Run(Ort::RunOptions{nullptr}, inputNames.data(),
                    inputTensors.data(), 1, outputNames.data(),
                    outputTensors.data(), 1);
        auto e_t = std::chrono::steady_clock::now();
        double dr_s = std::chrono::duration<double, std::milli>(e_t - s_t).count();
        cout << "running inference cost time " << dr_s << " ms" << endl;
    }
    // Print the first few feature values as a sanity check
    for (int j = 0; j < 10; ++j)
    {
        cout << outputTensorValues.at(j) << endl;
    }
    return 0;
}
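The preprocessing block above mirrors the Python export script step for step: resize, scale to [0, 1], per-channel normalization, then cv::dnn::blobFromImage to reorder HWC into NCHW. To convince yourself the two pipelines agree, the same steps can be reproduced in a few lines of numpy (a sketch, assuming the test image from section 3):

import cv2
import numpy as np

img = cv2.imread("000_000_gan0002_c3s1_136308_04.jpg")   # BGR, exactly as in the C++ code
img = cv2.resize(img, (224, 224)).astype(np.float32) / 255.0
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
blob = ((img - mean) / std).transpose(2, 0, 1)[None]     # HWC -> NCHW, like blobFromImage
print(blob.shape)                                        # (1, 3, 224, 224)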
CUDA version
#include <iostream>
#include <vector>
#include <numeric>
#include <chrono>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#include <onnxruntime_cxx_api.h>

using namespace std;
using namespace cv;
using namespace Ort;

// Product of all dimensions, i.e. the tensor's element count
template <typename T>
T vectorProduct(const std::vector<T>& v)
{
    return accumulate(v.begin(), v.end(), static_cast<T>(1), std::multiplies<T>());
}

int main(int argc, char** argv)
{
    if (argc != 3)
    {
        cerr << "usage: " << argv[0] << " image_path model_path" << endl;
        return 1;
    }
    cv::Mat image = imread(argv[1]);
    if (image.empty())
    {
        cerr << "failed to read input image " << argv[1] << endl;
        return 1;
    }
    string model_path = argv[2];

    Env env(ORT_LOGGING_LEVEL_WARNING, "Default");
    SessionOptions session_options;
    // Register the CUDA execution provider (device 0) before creating the session
    OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    Session session(env, model_path.c_str(), session_options);

    size_t numInputNodes = session.GetInputCount();
    size_t numOutputNodes = session.GetOutputCount();
    cout << "Number of Input Nodes: " << numInputNodes << endl;
    cout << "Number of Output Nodes: " << numOutputNodes << endl;

    AllocatorWithDefaultOptions allocator;
    const char* inputName = session.GetInputName(0, allocator);
    cout << "Input Name: " << inputName << endl;
    TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
    auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> inputDims = inputTensorInfo.GetShape();  // NCHW
    cout << "Input Dimensions: ";
    for (size_t i = 0; i < inputDims.size(); ++i)
    {
        cout << inputDims[i] << " ";
    }
    cout << endl;
    cout << "-----------------------------------------" << endl;

    const char* outputName = session.GetOutputName(0, allocator);
    cout << "Output Name: " << outputName << endl;
    TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
    auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
    cout << "Output Dimensions: ";
    for (size_t i = 0; i < outputDims.size(); ++i)
    {
        cout << outputDims[i] << " ";
    }
    cout << endl;

    // Data preprocessing: must match the Python export script exactly
    cv::Mat resizedImageBGR, resizedImage, preprocessedImage;
    // cv::Size is (width, height); the dims are NCHW, so width = at(3), height = at(2)
    cv::resize(image, resizedImageBGR, cv::Size(inputDims.at(3), inputDims.at(2)));
    // The export script fed BGR straight into the model, so no BGR->RGB swap here either
    resizedImageBGR.convertTo(resizedImage, CV_32F, 1.0 / 255);
    cv::Mat channels[3];
    cv::split(resizedImage, channels);
    channels[0] = (channels[0] - 0.485) / 0.229;
    channels[1] = (channels[1] - 0.456) / 0.224;
    channels[2] = (channels[2] - 0.406) / 0.225;
    cv::merge(channels, 3, resizedImage);
    cv::dnn::blobFromImage(resizedImage, preprocessedImage);  // HWC -> NCHW

    size_t inputTensorSize = vectorProduct(inputDims);
    std::vector<float> inputTensorValues(inputTensorSize);
    inputTensorValues.assign(preprocessedImage.begin<float>(),
                             preprocessedImage.end<float>());
    size_t outputTensorSize = vectorProduct(outputDims);
    std::vector<float> outputTensorValues(outputTensorSize);

    std::vector<const char*> inputNames{inputName};
    std::vector<const char*> outputNames{outputName};
    std::vector<Value> inputTensors;
    std::vector<Value> outputTensors;
    // The I/O buffers stay in CPU memory; onnxruntime copies them to and from the GPU
    MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
    inputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, inputTensorValues.data(), inputTensorSize,
        inputDims.data(), inputDims.size()));
    outputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, outputTensorValues.data(), outputTensorSize,
        outputDims.data(), outputDims.size()));

    // The first iterations include CUDA warm-up; exclude them when averaging
    for (int i = 0; i < 100; ++i)
    {
        auto s_t = std::chrono::steady_clock::now();
        session.Run(Ort::RunOptions{nullptr}, inputNames.data(),
                    inputTensors.data(), 1, outputNames.data(),
                    outputTensors.data(), 1);
        auto e_t = std::chrono::steady_clock::now();
        double dr_s = std::chrono::duration<double, std::milli>(e_t - s_t).count();
        cout << "running inference cost time " << dr_s << " ms" << endl;
    }
    // Print the first few feature values as a sanity check
    for (int j = 0; j < 10; ++j)
    {
        cout << outputTensorValues.at(j) << endl;
    }
    return 0;
}
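For reference, the same provider selection in the onnxruntime Python API is a single argument. The sketch below also shows the warm-up run that explains why the first timings of the 100-iteration loop above come out slower and should be excluded from the average:

import numpy as np
import onnxruntime

# prefer CUDA and fall back to CPU if the provider is unavailable
sess = onnxruntime.InferenceSession(
    "swin-transform.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
x = np.random.rand(1, 3, 224, 224).astype(np.float32)
sess.run(None, {"input": x})   # warm-up: excludes CUDA initialization from later timings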