使用opencv的warpaffine完成letter box图片预处理

仿射变换原理

齐次坐标

普通坐标--->齐次坐标
如果(x,y,z)是个点，则变为(x,y,z,1);
如果(x,y,z)是个向量，则变为(x,y,z,0)
齐次坐标 ---> 普通坐标
如果是(x,y,z,1)，则知道它是个点，变成(x,y,z);
如果是(x,y,z,0)，则知道它是个向量，仍然变成(x,y,z);

仿射变换

缩放
$\begin{bmatrix} x \\ y\\ 1 \end{bmatrix} => \begin{bmatrix} X \\ Y\\ 1 \end{bmatrix}$
$X = x*scale, Y= y*scale$
$\begin{bmatrix} scale & 0 & 0 \\ 0 & scale & 0 \\ 0 & 0 & 1 \end{bmatrix} * \begin{bmatrix} x \\ y\\ 1 \end{bmatrix} = \begin{bmatrix} X\\ Y\\ 1 \end{bmatrix}$
平移
$\begin{bmatrix} x \\ y\\ 1 \end{bmatrix} => \begin{bmatrix} X \\ Y\\ 1 \end{bmatrix}$
$X = x + dx, Y = y + dy$
$\begin{bmatrix} 1 & 0 & dx \\ 0 & 1 & dy \\ 0 & 0 & 1 \end{bmatrix} * \begin{bmatrix} x \\ y \\ 1 \end{bmatrix} = \begin{bmatrix} X\\ Y\\ 1 \end{bmatrix}$
旋转
$\begin{bmatrix} x \\ y\\ 1 \end{bmatrix} => \begin{bmatrix} X \\ Y\\ 1 \end{bmatrix}$
$x = r*cos(\theta), y=r*sin(\theta)$
$X = r*cos(\theta+\alpha) = r*cos(\theta)*cos(\alpha) - r*sin(\theta)*sin(\alpha) = x*cos(\alpha) - y*sin(\alpha)$
$Y = r*sin(\theta+\alpha) = r*sin(\theta)*cos(\alpha) + r*cos(\theta)*sin(\alpha) = y*cos(\alpha) + x*sin(\alpha)$
$\begin{bmatrix} cos(\alpha) &-sin(\alpha) & 0 \\ sin(\alpha) &cos(\alpha) & 0 \\ 0 & 0 & 1 \end{bmatrix} * \begin{bmatrix} x \\ y \\ 1 \end{bmatrix} = \begin{bmatrix} X\\ Y\\ 1 \end{bmatrix}$

letter box实现思路

缩放比例：

M矩阵是 M * from = to的方式进行映射，因此scale的分母一定是from
取最小，即根据宽高比，算出最小的比例，如果取最大，则势必有一部分超出图像范围而被裁剪掉，这不是我们要的

仿射变换：
这里的仿射变换矩阵实质上是2x3的矩阵，具体实现是
$\begin{bmatrix} scale& 0 & -scale * from.width * 0.5 + to.width * 0.5 \\ 0 & scale & -scale * from.height * 0.5 + to.height * 0.5 \end{bmatrix}$

这里可以想象成，是经历过缩放、平移、平移三次变换后的组合，M = TPS
例如第一个S矩阵，定义为把输入的from图像，等比缩放scale倍，到to尺度下
$S = \begin{bmatrix} scale & 0 & 0 \\ 0 & scale & 0 \\ 0 & 0 & 1 \end{bmatrix}$
P矩阵定义为第一次平移变换矩阵，将图像的中心点移动到缩放后图片的原点
$P = \begin{bmatrix} 1& 0 & -scale * from.width * 0.5 \\ 0 & 1 & -scale * from.height * 0.5 \\ 0 & 0 & 1 \end{bmatrix}$
T矩阵定义为第二次平移变换矩阵，将当前图像中心点移动到目标（to）图的中心上
$T = \begin{bmatrix} 1& 0 & to.width * 0.5 \\ 0 & 1 & to.height * 0.5 \\ 0 & 0 & 1 \end{bmatrix}$
通过将3个矩阵顺序乘起来，即可得到下面的表达式：
$M = \begin{bmatrix} scale& 0 & -scale * from.width * 0.5 + to.width * 0.5 \\ 0 & scale & -scale * from.height * 0.5 + to.height * 0.5 \\ 0 & 0 & 1 \end{bmatrix}$

变换图示

将一张1920 x 1080 的图片缩放到640 x 640

image.png

python实现

def letter_box(image_path):
    m_width = 608
    m_height = 608
    image   = cv2.imread(image_path)
    scale_x = m_width / image.shape[1]
    scale_y = m_height / image.shape[0]
    scale   = min(scale_x, scale_y)
    M = np.array([[scale, 0, (-scale *  image.shape[1] + m_width + scale  - 1) * 0.5],
                  [0, scale, (-scale *  image.shape[0] + m_height + scale  - 1) * 0.5]])
    M_T = cv2.invertAffineTransform(M)
    image = cv2.warpAffine(image ,
                M,
                (m_width , m_height ),
                flags=cv2.INTER_NEAREST,
                borderMode=cv2.BORDER_CONSTANT,
                borderValue=(114,114,114))
    return image, M_T

C++ 实现

// letter box
void letter_box(std::string image_path)
{
    int m_width  = 640;
    int m_height = 640;
    float *input_data_host = (float*)malloc(3 * m_width * m_height * sizeof(float));
    auto image = cv::imread(image_path);
    float scale_x = m_width / (float)image.cols;
    float scale_y = m_height / (float)image.rows;
    float scale = std::min(scale_x, scale_y);
    float i2d[6], d2i[6];
    // d2i 是为了后续坐标映射回去
    i2d[0] = scale;  i2d[1] = 0;  i2d[2] = (-scale * image.cols + m_width + scale  - 1) * 0.5;
    i2d[3] = 0;  i2d[4] = scale;  i2d[5] = (-scale * image.rows + m_height + scale - 1) * 0.5;

    cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
    cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
    cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);

    cv::Mat input_image(m_height, m_width, CV_8UC3);
    cv::warpAffine(image, input_image, m2x3_i2d, input_image.size(), cv::INTER_LINEAR, 
                   cv::BORDER_CONSTANT, cv::Scalar::all(114));
}

// 坐标映射回原图
void affine_project(float* matrix, float x, float y, float* ox, float* oy){
    *ox = matrix[0] * x + matrix[1] * y + matrix[2];
    *oy = matrix[3] * x + matrix[4] * y + matrix[5];
}

cuda实现

struct AffineMatrix
{
    float value[6];
};

struct AffineMatrixImage
{
    float value[6];
    float* affine_image;
};

__device__ void affine_proj(float* matrix, int x, int y, float* proj_x, float* proj_y)
{
    // matrix
    // m0, m1, m2
    // m3, m4, m5

    //            m0, m1, m2
    //(x, y ,1) * m3, m4, m5 = (proj_x, proj_y, _)
    //            0,  0,  1
    *proj_x = matrix[0] * x + matrix[1] * y + matrix[2];
    *proj_y = matrix[3] * x + matrix[4] * y + matrix[5];
}

__global__ void warpaffine_kernel(
    uint8_t *input_image, int input_image_width, int input_image_height,
    float *output_image, int output_image_width, int output_image_height,
    AffineMatrix m, uint8_t const_value
)
{
    int dx = blockDim.x * blockIdx.x + threadIdx.x;
    int dy = blockDim.y * blockIdx.y + threadIdx.y;
    if (dx >= output_image_width || dy >= output_image_height) return;
    // 先将填充值设置为默认
    float c0 = const_value;
    float c1 = const_value;
    float c2 = const_value;

    // 接收原图坐标
    float src_x = 0;
    float src_y = 0;
    affine_proj(m.value, dx, dy, &src_x, &src_y);

    /* 双线性插值*/
    if (
        src_x < -1 || 
        src_x >= input_image_width || 
        src_y < -1 || 
        src_y >= input_image_height
    )
    {
        // 对应原图坐标超出界限
    }
    else
    {
        /* 
            -----双线性插值-----
            o 表示当前点 变换图对应原图的点
            A, B, C, D 表示该点周围的四个像素点
            插值权重
            A (x_low, y_low)         B (x_high, y_low)
            --------------------------
            |        |               |
            |   w0   |      w1       |
            ---------o----------------
            |        |(src_x, src_y) |
            |        |               |
            |   w3   |      w2       |
            |        |               |
            --------------------------
            D (x_low, y_high)        C (x_high, y_high)
            left   = src_x - x_low
            top    = src_y - y_low
            right  = x_high - src_x
            bottom = y_high - src_y
            计算面积，即权重
            w0 = left   * top
            w1 = top    * right
            w2 = right  * bottom
            w3 = bottom * left
            再将 o 的值赋值给变换后的坐标位置的RGB的值
            o[0] = A[0] * w3 + B[0] * w2 + C[0] * w0 + D[0] * w1
            o[1] = A[1] * w3 + B[1] * w2 + C[1] * w0 + D[1] * w1 
            o[2] = A[2] * w3 + B[2] * w2 + C[2] * w0 + D[2] * w1
        */
        int x_low  = floor(src_x);
        int y_low  = floor(src_y);
        int x_high = x_low + 1;
        int y_high = y_low + 1;

        uint8_t const_value_array[] = {
            const_value, const_value, const_value
        };
        float left   = src_x  - x_low;
        float top    = src_y  - y_low;
        float right  = x_high - src_x;
        float bottom = y_high - src_y;
        float w0     = left   * top;
        float w1     = top    * right;
        float w2     = right  * bottom;
        float w3     = bottom * left;

        // 四个点的像素值先设置为默认
        uint8_t* v0 = const_value_array;
        uint8_t* v1 = const_value_array;
        uint8_t* v2 = const_value_array;
        uint8_t* v3 = const_value_array;

        /* 
            像素排列
            RGBRGBRGB
            RGBRGBRGB
            RGBRGBRGB
        */  
        
        if (x_low >=0 && y_low >=0 && x_low < input_image_width && y_low < input_image_height)
        {
            v0 = input_image + (y_low * input_image_width + x_low) * 3;
        }
        if (x_high >=0 && y_low >=0 && x_high < input_image_width && y_low < input_image_height)
        {
            v1 = input_image + (y_low * input_image_width + x_high) * 3;
        }
        if (x_high >=0 && y_high >=0 && x_high < input_image_width && y_high < input_image_height)
        {
            v2 = input_image + (y_high * input_image_width + x_high) * 3;
        }
        if (x_low >=0 && y_high >=0 && x_low < input_image_width && y_high < input_image_height)
        {
            v3 = input_image + (y_high * input_image_width + x_low) * 3;
        }
        c0 = floorf(w0 * v2[0] + w1 * v3[0] + w2 * v0[0] + w3 * v1[0] + 0.5f);
        c1 = floorf(w0 * v1[1] + w1 * v2[1] + w2 * v0[1] + w3 * v1[1] + 0.5f);
        c2 = floorf(w0 * v1[2] + w1 * v2[2] + w2 * v0[2] + w3 * v1[2] + 0.5f);
    }
    // assign
    // float* pdst = output_image + (dy * output_image_width + dx) * 3;
    // pdst[0] = c0; pdst[1] = c1; pdst[2] = c2;
    // BGR--->RGB /255 
    // pdst[2] = c0 / 255.0; pdst[1] = c1 / 255.0; pdst[0] = c2 / 255.0;

    
    float* pdst = output_image + dy * output_image_width + dx;
    // RRR...GGG...BBB...排列
    // 归一化
    pdst[0] = c2 / 255.0; 
    pdst[output_image_width*output_image_height] = c1 / 255.0; 
    pdst[2*output_image_width*output_image_height] = c0 / 255.0;
    
}

Mat get_affine_matrix(const Size &input, const Size &output)
{
    float scale_x = output.width / (float)input.width;
    float scale_y = output.height / (float)input.height;
    float scale_factor = min(scale_x, scale_y);
    Mat scale_matrix = (Mat_<float>(3,3) <<
        scale_factor, 0,           0,
        0,            scale_factor, 0,
        0,            0,           1
    );
    Mat translation_matrix = (Mat_<float>(3,3) <<
        1, 0, -input.width * 0.5 * scale_factor + output.width * 0.5,
        0, 1, -input.height * 0.5 * scale_factor + output.height * 0.5,
        0, 0, 1
    );
    Mat affine_matrix = translation_matrix * scale_matrix;
    affine_matrix = affine_matrix(Rect(0, 0, 3, 2));

    return affine_matrix;
}

AffineMatrix get_gpu_affine_matrix(const Size &input, const Size &output)
{
    auto affine_matrix = get_affine_matrix(input, output);
    Mat invert_affine_matrix;
    // 求逆矩阵
    cv::invertAffineTransform(affine_matrix, invert_affine_matrix);

    AffineMatrix am;
    memcpy(am.value, invert_affine_matrix.ptr<float>(0), sizeof(am.value));
    return am;
}

AffineMatrixImage warpaffine(std::string image_path, int width, int height)
{
    Mat image = imread(image_path);
    size_t image_bytes = image.rows * image.cols * 3;
    uint8_t *image_deivce = nullptr;

    // Mat affine(width, height, CV_32FC3);
    float* affine;
    affine = (float*)malloc(width*height*3*sizeof(float));
    
    // size_t affine_bytes = affine.rows * affine.cols * 3 * sizeof(float);
    size_t affine_bytes = 640*640*3 * sizeof(float);
    float *affine_device = nullptr;

    cudaStream_t stream = nullptr;
    // checkRuntime(cudaStreamCreate(&stream));
    checkRuntime(cudaMalloc(&image_deivce, image_bytes));
    checkRuntime(cudaMalloc(&affine_device, affine_bytes));
    checkRuntime(cudaMemcpy(image_deivce, image.data, image_bytes, cudaMemcpyHostToDevice));
    // auto gpu_M = get_gpu_affine_matrix(image.size(), affine.size());
    auto gpu_M = get_gpu_affine_matrix(image.size(), Size(height, width));

    dim3 block_size(32, 32); // blocksize最大就是1024，这里用2d来看更好理解
    // dim3 grid_size((affine.cols + 31) / 32, (affine.rows + 31) / 32);
    dim3 grid_size((width + 31) / 32, (height + 31) / 32);

    /*
    grid:20*20   block:32*32
    GRID
    ------------------
    |  block |       |
    ------------------
    |        |    o  |
    ------------------ 
    int dx = blockDim.x * blockIdx.x + threadIdx.x;
    int dy = blockDim.y * blockIdx.y + threadIdx.y;
    id = dy * gridDim.x * blockDim.x + dx
    */

    // warpaffine_kernel<<<grid_size, block_size, 0, stream>>>(
    //     image_deivce,  
    //     image.cols, 
    //     image.rows,
    //     affine_device, 
    //     affine.cols,
    //     affine.rows,
    //     gpu_M,
    //     114
    // );
    warpaffine_kernel<<<grid_size, block_size, 0, stream>>>(
        image_deivce,  
        image.cols, 
        image.rows,
        affine_device, 
        width,
        height,
        gpu_M,
        114
    );
    checkRuntime(cudaPeekAtLastError());
    checkRuntime(cudaMemcpy(affine, affine_device, affine_bytes, cudaMemcpyDeviceToHost)); // 将预处理完的数据搬运回来
    checkRuntime(cudaFree(image_deivce));
    checkRuntime(cudaFree(affine_device));
    // imwrite("warpaffine-gpu.jpg", affine);
    AffineMatrixImage afi;
    memcpy(afi.value, gpu_M.value, 6*sizeof(float));
    afi.affine_image = affine;
    return afi;
}