前言
使用cuda对图像处理过程进行加速是很常见的操作,而图像处理算法中的膨胀和腐蚀又常常被用来做噪声过滤。本篇博客使用cuda、opencv和c++实现图像膨胀和腐蚀的加速,并将opencv的api接口与cuda并行处理的速度做一个对比试验
膨胀腐蚀原理简介
形态学操作中的膨胀/腐蚀,具体效果是:膨胀让图像中高像素值的范围扩大,腐蚀让高像素值的范围缩小(即低像素值的范围扩大),肉眼观察到则是图像亮的区域扩张/收缩
整个膨胀/腐蚀流程如下:
1)我们定义一个卷积核矩阵.这个矩阵可以是任何形状的,但通常而言,是矩形或者圆形的.同时要定义一个锚点位置,一般默认锚点为矩阵或者圆形中心点
2)用这个卷积核矩阵逐像素地在原始图像矩阵上平移,同时计算锚点位置的新像素值。计算的方式是将卷积核矩阵与图像矩阵对应元素相加/相减,然后取最大值/最小值写入输出图像的锚点位置 (分别对应膨胀/腐蚀)。本文代码使用的是矩形的平坦结构元素,因此实际上就是直接取邻域内的最大值/最小值
以灰度图为例,来说明膨胀和腐蚀的过程,首先这是输入的灰度图和卷积核
这是膨胀操作
这是腐蚀操作
代码展示
废话不多说,上代码
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <opencv2\opencv.hpp>
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
using namespace cv;
// Erosion kernel: each thread computes one output pixel as the minimum of the
// input pixels inside a rectangular window around it.
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel (x, y). The
// grid may be larger than the image -- surplus threads return immediately.
//
// dataIn        input image, imgWidth * imgHeight bytes, rows tightly packed
// dataOut       output image with the same layout; must not alias dataIn,
//               because other threads are still reading dataIn
// erodeElement  window size (width x height); the anchor is at
//               (width / 2, height / 2), so exactly width x height pixels are
//               examined
// imgWidth      image width in pixels
// imgHeight     image height in pixels
//
// Near the image border the window is clamped to the image, i.e. pixels that
// would fall outside the image simply do not take part in the minimum.
// NOTE(review): this is intended to match cv::erode with its default border
// value -- confirm against the OpenCV documentation.
__global__ void erodeInCuda(unsigned char* dataIn, unsigned char* dataOut, Size erodeElement, int imgWidth, int imgHeight)
{
    // Pixel coordinates handled by this thread.
    const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so threads beyond the image
    // edge must neither read nor write.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    const int elementWidth = erodeElement.width;
    const int elementHeight = erodeElement.height;
    const int halfEW = elementWidth / 2;
    const int halfEH = elementHeight / 2;

    // Window [begin, end) in image coordinates, clamped to the image.
    int rowBegin = yIndex - halfEH;
    int rowEnd = rowBegin + elementHeight;
    int colBegin = xIndex - halfEW;
    int colEnd = colBegin + elementWidth;
    if (rowBegin < 0) rowBegin = 0;
    if (colBegin < 0) colBegin = 0;
    if (rowEnd > imgHeight) rowEnd = imgHeight;
    if (colEnd > imgWidth) colEnd = imgWidth;

    // Accumulate in a register instead of re-reading/re-writing dataOut.
    // Starting from the centre pixel keeps the result defined even for a
    // degenerate (empty) element.
    unsigned char minValue = dataIn[yIndex * imgWidth + xIndex];
    for (int row = rowBegin; row < rowEnd; row++)
    {
        for (int col = colBegin; col < colEnd; col++)
        {
            const unsigned char value = dataIn[row * imgWidth + col];
            if (value < minValue)
            {
                minValue = value;
            }
        }
    }

    dataOut[yIndex * imgWidth + xIndex] = minValue;
}
// Dilation kernel: each thread computes one output pixel as the maximum of the
// input pixels inside a rectangular window around it.
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel (x, y). The
// grid may be larger than the image -- surplus threads return immediately.
//
// dataIn         input image, imgWidth * imgHeight bytes, rows tightly packed
// dataOut        output image with the same layout; must not alias dataIn,
//                because other threads are still reading dataIn
// dilateElement  window size (width x height); the anchor is at
//                (width / 2, height / 2), so exactly width x height pixels are
//                examined
// imgWidth       image width in pixels
// imgHeight      image height in pixels
//
// Near the image border the window is clamped to the image, i.e. pixels that
// would fall outside the image simply do not take part in the maximum.
// NOTE(review): this is intended to match cv::dilate with its default border
// value -- confirm against the OpenCV documentation.
__global__ void dilateInCuda(unsigned char* dataIn, unsigned char* dataOut, Size dilateElement, int imgWidth, int imgHeight)
{
    // Pixel coordinates handled by this thread.
    const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so threads beyond the image
    // edge must neither read nor write.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    const int elementWidth = dilateElement.width;
    const int elementHeight = dilateElement.height;
    const int halfEW = elementWidth / 2;
    const int halfEH = elementHeight / 2;

    // Window [begin, end) in image coordinates, clamped to the image.
    int rowBegin = yIndex - halfEH;
    int rowEnd = rowBegin + elementHeight;
    int colBegin = xIndex - halfEW;
    int colEnd = colBegin + elementWidth;
    if (rowBegin < 0) rowBegin = 0;
    if (colBegin < 0) colBegin = 0;
    if (rowEnd > imgHeight) rowEnd = imgHeight;
    if (colEnd > imgWidth) colEnd = imgWidth;

    // Accumulate in a register instead of re-reading/re-writing dataOut.
    // Starting from the centre pixel keeps the result defined even for a
    // degenerate (empty) element.
    unsigned char maxValue = dataIn[yIndex * imgWidth + xIndex];
    for (int row = rowBegin; row < rowEnd; row++)
    {
        for (int col = colBegin; col < colEnd; col++)
        {
            const unsigned char value = dataIn[row * imgWidth + col];
            if (value > maxValue)
            {
                maxValue = value;
            }
        }
    }

    dataOut[yIndex * imgWidth + xIndex] = maxValue;
}
// Benchmark driver: erodes and dilates "lena.jpg" with the CUDA kernels and
// with OpenCV's CPU implementation, prints the elapsed time of each variant
// and writes the four result images to disk.
// Returns 0 on success; exits with EXIT_FAILURE if the image cannot be read or
// any CUDA runtime call fails.
int main()
{
    // Abort with a readable message as soon as a CUDA runtime call fails, so
    // that a later call does not fail mysteriously because of an earlier error.
    auto checkCuda = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Load the input as a single-channel grayscale image.
    Mat grayImg = imread("lena.jpg", 0);
    if (grayImg.empty())
    {
        std::cerr << "failed to read input image lena.jpg" << std::endl;
        return EXIT_FAILURE;
    }
    // The kernels index the image as one tightly packed buffer.
    if (!grayImg.isContinuous())
    {
        grayImg = grayImg.clone();
    }

    const int imgWidth = grayImg.cols;
    const int imgHeight = grayImg.rows;
    // Compute the byte count in size_t to avoid int overflow on large images.
    const size_t imgBytes = static_cast<size_t>(imgWidth) * static_cast<size_t>(imgHeight) * sizeof(unsigned char);

    // Host-side result images for the GPU erosion / dilation.
    Mat dstImg1(imgHeight, imgWidth, CV_8UC1, Scalar(0));
    Mat dstImg2(imgHeight, imgWidth, CV_8UC1, Scalar(0));

    // Device buffers: input image, erosion result, dilation result.
    unsigned char* d_in = nullptr;
    unsigned char* d_out1 = nullptr;
    unsigned char* d_out2 = nullptr;
    checkCuda(cudaMalloc((void**)&d_in, imgBytes), "cudaMalloc(d_in)");
    checkCuda(cudaMalloc((void**)&d_out1, imgBytes), "cudaMalloc(d_out1)");
    checkCuda(cudaMalloc((void**)&d_out2, imgBytes), "cudaMalloc(d_out2)");

    // Upload the input image.
    checkCuda(cudaMemcpy(d_in, grayImg.data, imgBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    // 32x32 threads per block; enough blocks (rounded up) to cover the image.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Structuring element size (width 3, height 5), shared by GPU and CPU runs.
    Size Element(3, 5);

    // One pair of events is created once and reused for both GPU measurements.
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    float elapsedTime = 0.0f;

    // ---- GPU erosion (timed interval includes the device-to-host copy) ----
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    erodeInCuda << <blocksPerGrid, threadsPerBlock >> > (d_in, d_out1, Element, imgWidth, imgHeight);
    checkCuda(cudaGetLastError(), "erodeInCuda launch");
    checkCuda(cudaMemcpy(dstImg1.data, d_out1, imgBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(erode result)");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("erode gpu time cost: %3.5f ms\n", elapsedTime);
    cv::imwrite("erode_gpu.jpg", dstImg1);

    // ---- CPU erosion (OpenCV) ----
    Mat erodeImg;
    Mat element = getStructuringElement(MORPH_RECT, Element);
    double time1 = static_cast<double>(cv::getTickCount());
    erode(grayImg, erodeImg, element);
    double time2 = static_cast<double>(cv::getTickCount());
    std::cout << "erode cpu Time use: " << 1000 * (time2 - time1) / cv::getTickFrequency() << "ms" << std::endl;
    cv::imwrite("erode_cpu.jpg", erodeImg);

    // ---- GPU dilation (timed interval includes the device-to-host copy) ----
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    dilateInCuda << <blocksPerGrid, threadsPerBlock >> > (d_in, d_out2, Element, imgWidth, imgHeight);
    checkCuda(cudaGetLastError(), "dilateInCuda launch");
    checkCuda(cudaMemcpy(dstImg2.data, d_out2, imgBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dilate result)");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf(" dilate gpu time cost: %3.5f ms\n", elapsedTime);
    cv::imwrite("dilate_gpu.jpg", dstImg2);

    // ---- CPU dilation (OpenCV) ----
    Mat dilateImg;
    time1 = static_cast<double>(cv::getTickCount());
    dilate(grayImg, dilateImg, element);
    time2 = static_cast<double>(cv::getTickCount());
    std::cout << "dilate cpu Time use: " << 1000 * (time2 - time1) / cv::getTickFrequency() << "ms" << std::endl;
    cv::imwrite("dilate_cpu.jpg", dilateImg);

    // Release the events and device buffers.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_in), "cudaFree(d_in)");
    checkCuda(cudaFree(d_out1), "cudaFree(d_out1)");
    checkCuda(cudaFree(d_out2), "cudaFree(d_out2)");
    return 0;
}
这里的代码比较简单且存在一些注释,不需要详细说明,直接看结果就行了
结果展示和时间测试
输入图片还是女神lena
膨胀的cpu与gpu输出对比
腐蚀的cpu与gpu输出对比
从测试时间来看,cuda gpu只比cpu快不到50%。需要注意的是,上面代码中gpu的计时区间包含了把结果从显存拷回主机的cudaMemcpy,因此cuda gpu花费的大部分时间估计还是在数据拷贝上
参考博客
opencv之膨胀与腐蚀 - core! - 博客园 (cnblogs.com)
CUDA精进之路(二):图像处理——形态学滤波(膨胀、腐蚀、开闭运算)
数学形态学运算——腐蚀、膨胀、开运算、闭运算