前言
测试环境
- OS: ubuntu 20.04
- CUDA: v11
- GCC: v10.3
矩阵转置GPU实现
矩阵转置方法
代码实现
main.cu
#include <iostream>
#include <cuda_runtime.h>
#include <math.h>
#include "utils.cuh"
// Naive out-of-place transpose of an N x N int matrix stored as a flat array
// (element (row, col) lives at row * N + col): d_out[x][y] = d_in[y][x].
//
// Launch layout: 2D grid of 2D blocks covering at least N x N threads; threads
// that fall outside the matrix do nothing.
// Access pattern: consecutive threadIdx.x values read consecutive d_in
// elements (coalesced read) but write d_out elements N apart (non-coalesced
// write).
// d_in and d_out must be different buffers: each thread writes a location that
// another thread reads, with no synchronization between them.
//
// d_in  : source matrix, N * N elements.
// d_out : destination matrix, N * N elements.
// N     : matrix dimension.
__global__ void transpose1(const int* d_in, int* d_out, int N) {
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;
    if (nx >= N || ny >= N) {
        return;
    }
    // Index in size_t so row * N + col cannot overflow int for large N.
    const size_t n = static_cast<size_t>(N);
    d_out[nx * n + ny] = d_in[ny * n + nx];
}
// Naive out-of-place transpose of an N x N int matrix stored as a flat array
// (element (row, col) lives at row * N + col): d_out[y][x] = d_in[x][y].
//
// Launch layout: 2D grid of 2D blocks covering at least N x N threads; threads
// that fall outside the matrix do nothing.
// Access pattern: consecutive threadIdx.x values read d_in elements N apart
// (non-coalesced read) but write consecutive d_out elements (coalesced write).
// d_in and d_out must be different buffers: each thread writes a location that
// another thread reads, with no synchronization between them.
//
// d_in  : source matrix, N * N elements.
// d_out : destination matrix, N * N elements.
// N     : matrix dimension.
__global__ void transpose2(const int* d_in, int* d_out, int N) {
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;
    if (nx >= N || ny >= N) {
        return;
    }
    // Index in size_t so row * N + col cannot overflow int for large N.
    const size_t n = static_cast<size_t>(N);
    d_out[ny * n + nx] = d_in[nx * n + ny];
}
// Benchmark driver: times one launch of transpose1 and one of transpose2 on an
// N x N int matrix, prints both timings, and verifies each result on the host.
// Returns 0 when both kernels produce a correct transpose, 1 otherwise.
int main() {
    const int N = 4096;
    const size_t count = static_cast<size_t>(N) * N;
    const size_t bytes = count * sizeof(int);

    int* in_h = new int[count];
    int* out_h = new int[count];
    // Fill the input with a known pattern: the upload must not read
    // indeterminate memory, and distinct values make verification meaningful.
    for (size_t i = 0; i < count; ++i) {
        in_h[i] = static_cast<int>(i);
    }

    int* in_d = nullptr;
    int* out_d = nullptr;
    CHECK(cudaMalloc((void**)&in_d, bytes));
    CHECK(cudaMalloc((void**)&out_d, bytes));
    CHECK(cudaMemcpy(in_d, in_h, bytes, cudaMemcpyHostToDevice));

    const int TILE_WIDTH = 32;
    dim3 block_size(TILE_WIDTH, TILE_WIDTH);
    dim3 grid_size;
    // Ceil-divide so the grid covers N even when it is not a tile multiple.
    grid_size.x = grid_size.y = (N + TILE_WIDTH - 1) / TILE_WIDTH;

    // Warm-up launches so one-time startup costs are not included in the timings.
    for (int i = 0; i < 3; i++) {
        transpose1<<<grid_size, block_size>>>(in_d, out_d, N);
        transpose2<<<grid_size, block_size>>>(in_d, out_d, N);
    }
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Copies out_d back to the host and checks it is the transpose of in_h.
    auto is_transposed = [&]() -> bool {
        CHECK(cudaMemcpy(out_h, out_d, bytes, cudaMemcpyDeviceToHost));
        for (int r = 0; r < N; ++r) {
            for (int c = 0; c < N; ++c) {
                if (out_h[static_cast<size_t>(c) * N + r] !=
                    in_h[static_cast<size_t>(r) * N + c]) {
                    return false;
                }
            }
        }
        return true;
    };

    bool all_ok = true;
    GPUTimer timer;

    // Clear the output first so a stale result from an earlier launch cannot
    // make the verification pass. The memset is issued before timer.start().
    CHECK(cudaMemset(out_d, 0, bytes));
    timer.start();
    transpose1<<<grid_size, block_size>>>(in_d, out_d, N);
    timer.stop();
    CHECK(cudaGetLastError());
    std::cout << "transpose1 time: " << timer.elpased_ms() << " ms" << std::endl;
    if (!is_transposed()) {
        std::cerr << "transpose1 produced a wrong result" << std::endl;
        all_ok = false;
    }

    CHECK(cudaMemset(out_d, 0, bytes));
    timer.start();
    transpose2<<<grid_size, block_size>>>(in_d, out_d, N);
    timer.stop();
    CHECK(cudaGetLastError());
    std::cout << "transpose2 time: " << timer.elpased_ms() << " ms" << std::endl;
    if (!is_transposed()) {
        std::cerr << "transpose2 produced a wrong result" << std::endl;
        all_ok = false;
    }

    CHECK(cudaFree(in_d));
    CHECK(cudaFree(out_d));
    delete[] in_h;
    delete[] out_h;
    return all_ok ? 0 : 1;
}
#pragma once
#include <stdio.h>
#include <cuda_runtime.h>
// Evaluates a CUDA runtime call exactly once. If it does not return
// cudaSuccess, prints the file, line, numeric error code and error string to
// stderr and terminates the process with exit status 1.
// The do { ... } while (0) wrapper makes the macro behave as a single
// statement, so it is safe inside an unbraced if/else.
#define CHECK(call)                                                   \
    do                                                                \
    {                                                                 \
        const cudaError_t error_code = (call);                        \
        if (error_code != cudaSuccess)                                \
        {                                                             \
            fprintf(stderr, "CUDA Error:\n");                         \
            fprintf(stderr, " File: %s\n", __FILE__);                 \
            fprintf(stderr, " Line: %d\n", __LINE__);                 \
            fprintf(stderr, " Error code: %d\n",                      \
                    static_cast<int>(error_code));                    \
            fprintf(stderr, " Error text: %s\n",                      \
                    cudaGetErrorString(error_code));                  \
            exit(1);                                                  \
        }                                                             \
    } while (0)
// Measures elapsed GPU time between start() and stop() using a pair of CUDA
// events. The events are recorded without a stream argument, i.e. on the
// default stream.
//
// Usage: start(); <launch work>; stop(); elapsed_ms();
//
// Failures of the underlying CUDA calls are reported on stderr and do not
// terminate the program. The class owns its event handles, so it is
// non-copyable to prevent the same handle being destroyed twice.
class GPUTimer {
public:
    // Creates the start and end events.
    GPUTimer() : m_start(nullptr), m_end(nullptr) {
        report(cudaEventCreate(&m_start), "cudaEventCreate(start)");
        report(cudaEventCreate(&m_end), "cudaEventCreate(end)");
    }

    // Destroys whichever events were successfully created.
    ~GPUTimer() {
        if (m_start != nullptr) {
            cudaEventDestroy(m_start);
        }
        if (m_end != nullptr) {
            cudaEventDestroy(m_end);
        }
    }

    GPUTimer(const GPUTimer&) = delete;
    GPUTimer& operator=(const GPUTimer&) = delete;

    // Records the start event.
    void start() {
        report(cudaEventRecord(m_start), "cudaEventRecord(start)");
    }

    // Records the end event and blocks the host until it has completed, so the
    // elapsed time can be read immediately afterwards.
    void stop() {
        report(cudaEventRecord(m_end), "cudaEventRecord(end)");
        report(cudaEventSynchronize(m_end), "cudaEventSynchronize(end)");
    }

    // Returns the time in milliseconds between the last start() and stop(),
    // or 0.0f if the query fails. The misspelled name is kept for existing
    // callers; new code should use elapsed_ms().
    float elpased_ms() {
        float ms = 0.0f;
        report(cudaEventElapsedTime(&ms, m_start, m_end), "cudaEventElapsedTime");
        return ms;
    }

    // Correctly spelled alias for elpased_ms().
    float elapsed_ms() {
        return elpased_ms();
    }

private:
    // Prints a diagnostic to stderr when a CUDA call did not succeed.
    static void report(cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "GPUTimer: %s failed: %s\n", what,
                    cudaGetErrorString(status));
        }
    }

    cudaEvent_t m_start;
    cudaEvent_t m_end;
};
测试结果
矩阵大小N | 128 | 512 | 1024 | 4096 |
---|---|---|---|---|
transpose1 | 0.014336ms | 0.110592ms | 0.432128ms | 6.77786ms |
transpose2 | 0.008192ms | 0.092064ms | 0.366592ms | 5.75283ms |