A CUDA transpose can be implemented with Thrust, cuBLAS, or cuBLASLt.
The code below uses the cuBLASLt API cublasLtMatrixTransform to transpose a cutlass TensorView.
The basic idea is to configure the attributes of each handle and descriptor, then issue the transform call.
template <typename T> inline cudaDataType_t GetCublasDType();
template <> inline cudaDataType_t GetCublasDType<float>() { return CUDA_R_32F; }
template <> inline cudaDataType_t GetCublasDType<half>() { return CUDA_R_16F; }
template <> inline cudaDataType_t GetCublasDType<int8_t>() { return CUDA_R_8I; }
template <> inline cudaDataType_t GetCublasDType<int>() { return CUDA_R_32I; }
template <> inline cudaDataType_t GetCublasDType<double>() {
return CUDA_R_64F;
}
template <> inline cudaDataType_t GetCublasDType<cutlass::half_t>() {
return CUDA_R_16F;
}
template <typename T> inline cublasLtOrder_t GetCublasLtLayout();
template <>
inline cublasLtOrder_t GetCublasLtLayout<cutlass::layout::ColumnMajor>() {
return CUBLASLT_ORDER_COL;
}
template <>
inline cublasLtOrder_t GetCublasLtLayout<cutlass::layout::RowMajor>() {
return CUBLASLT_ORDER_ROW;
}
template <>
inline cublasLtOrder_t
GetCublasLtLayout<cutlass::layout::ColumnMajorInterleaved<32>>() {
return CUBLASLT_ORDER_COL32;
}
template <typename Element, typename LayoutSrc, typename LayoutDes>
void change_layout_with_cublasLT(
const cutlass::TensorView<Element, LayoutSrc> &src_view,
cutlass::TensorView<Element, LayoutDes> &des_view,
cudaStream_t stream = nullptr) {
const auto src_extent = src_view.extent();
const auto des_extent = des_view.extent();
const auto src_cols = src_extent.column();
const auto src_rows = src_extent.row();
const auto des_cols = des_extent.column();
const auto des_rows = des_extent.row();
if ((src_cols != des_cols) || (src_rows != des_rows)) {
    throw std::logic_error(
        "change_layout_with_cublasLT only changes the layout: source and "
        "destination must have the same number of rows and columns\n");
}
cublasLtMatrixLayout_t src_layout;
cublasLtMatrixLayout_t des_layout;
cublasLtOrder_t src_order = GetCublasLtLayout<LayoutSrc>();
cublasLtOrder_t des_order = GetCublasLtLayout<LayoutDes>();
cublasLtMatrixTransformDesc_t transformDesc = nullptr;
cublasLtHandle_t ltHandle;
const auto element_type = GetCublasDType<Element>();
  checkCublasStatus(cublasLtCreate(&ltHandle));
  checkCublasStatus(cublasLtMatrixLayoutCreate(
      &src_layout, element_type, src_rows, src_cols,
      src_view.stride(0))); // ld taken from the source view's stride
  checkCublasStatus(cublasLtMatrixLayoutCreate(
      &des_layout, element_type, des_rows, des_cols,
      des_view.stride(0))); // ld taken from the destination view's stride
checkCublasStatus(cublasLtMatrixLayoutSetAttribute(
src_layout, CUBLASLT_MATRIX_LAYOUT_ORDER, &src_order, sizeof(src_order)));
checkCublasStatus(cublasLtMatrixLayoutSetAttribute(
des_layout, CUBLASLT_MATRIX_LAYOUT_ORDER, &des_order, sizeof(des_order)));
checkCublasStatus(
cublasLtMatrixTransformDescCreate(&transformDesc, CUDA_R_32F));
float alpha = 1.0f;
float beta = 0.0f;
  checkCublasStatus(cublasLtMatrixTransform(
      ltHandle, transformDesc, &alpha, src_view.data(), src_layout, &beta,
      nullptr, nullptr, des_view.data(), des_layout, stream));
  // release the descriptor, layouts and handle created above
  checkCublasStatus(cublasLtMatrixTransformDescDestroy(transformDesc));
  checkCublasStatus(cublasLtMatrixLayoutDestroy(src_layout));
  checkCublasStatus(cublasLtMatrixLayoutDestroy(des_layout));
  checkCublasStatus(cublasLtDestroy(ltHandle));
}
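The function above relies on a checkCublasStatus helper that is not part of the listing; a minimal sketch of such a helper, together with a hypothetical call site using cutlass::HostTensor (the tensor names and shapes here are illustrative, not from the original), could look like this:

#include <cstdio>
#include <cstdlib>
#include <cublasLt.h>

inline void checkCublasStatus(cublasStatus_t status) {
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::fprintf(stderr, "cuBLASLt call failed with status %d\n",
                 static_cast<int>(status));
    std::abort();
  }
}

// Hypothetical usage: copy a row-major 16x32 tensor into a column-major one.
// cutlass::HostTensor<float, cutlass::layout::RowMajor> src({16, 32});
// cutlass::HostTensor<float, cutlass::layout::ColumnMajor> dst({16, 32});
// auto src_view = src.device_view();
// auto dst_view = dst.device_view();
// change_layout_with_cublasLT(src_view, dst_view);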
The following program contains examples of transposition with Thrust, with cuBLAS, and with a hand-written kernel using shared memory.
#include <assert.h>
#include <cublas_v2.h>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
#include <algorithm>
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/random.h>
#include <thrust/scan.h>
#include <vector>
/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file,
const int line) {
if (CUBLAS_STATUS_SUCCESS != err) {
    fprintf(stderr,
            "CUBLAS error in file '%s', line %d\nerror %d\nterminating!\n",
            file, line, err);
// getch();
cudaDeviceReset();
assert(0);
}
}
// convert a linear index to a linear index in the transpose
struct transpose_index : public thrust::unary_function<size_t, size_t> {
size_t m, n;
__host__ __device__ transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}
__host__ __device__ size_t operator()(size_t linear_index) {
size_t i = linear_index / n;
size_t j = linear_index % n;
return m * j + i;
}
};
// convert a linear index to a row index
struct row_index : public thrust::unary_function<size_t, size_t> {
size_t n;
__host__ __device__ row_index(size_t _n) : n(_n) {}
  __host__ __device__ size_t operator()(size_t i) { return i / n; }
};
// transpose an M-by-N array
template <typename T>
void transpose(size_t m, size_t n, thrust::device_vector<T> &src,
thrust::device_vector<T> &dst) {
thrust::counting_iterator<size_t> indices(0);
thrust::gather(
thrust::make_transform_iterator(indices, transpose_index(n, m)),
thrust::make_transform_iterator(indices, transpose_index(n, m)) +
dst.size(),
src.begin(), dst.begin());
}
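As a quick sanity check of the index mapping (a worked example added here, not in the original): transposing a 2x3 row-major source into a 3x2 destination, transpose() builds transpose_index(3, 2), so each destination index k gathers source element 3*(k%2) + k/2:

// src (2x3, row-major):  a b c   stored as [a b c d e f]
//                        d e f
// dst index k -> src index 3*(k%2) + k/2:
//   k=0 -> 0 (a)   k=1 -> 3 (d)
//   k=2 -> 1 (b)   k=3 -> 4 (e)
//   k=4 -> 2 (c)   k=5 -> 5 (f)
// dst (3x2, row-major): [a d b e c f], i.e. the transpose of src.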
// shared-memory transpose
template <typename T>
__global__ void __launch_bounds__(1024)
    transpose_kernel(const T *src, T *dst, int dstM, int dstN) {
  __shared__ T shared_array[32][33];
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  for (int block_offset_y = blockIdx.y * blockDim.y; block_offset_y < dstM;
       block_offset_y += blockDim.y * gridDim.y) {
    for (int block_offset_x = blockIdx.x * blockDim.x; block_offset_x < dstN;
         block_offset_x += blockDim.x * gridDim.x) {
      // src coordinates
      int src_col = block_offset_y + tx;
      int src_row = block_offset_x + ty;
      if (src_col < dstM && src_row < dstN) {
        shared_array[ty][tx] = src[src_row * dstM + src_col]; // coalesced load
      }
      __syncthreads();
      // dst coordinates: the block/thread mapping follows the dst matrix
      int dst_row = block_offset_y + ty;
      int dst_col = block_offset_x + tx;
      if (dst_row < dstM && dst_col < dstN) {
        dst[dst_row * dstN + dst_col] = shared_array[tx][ty];
      }
      // wait until every thread has read its element before the next
      // grid-stride iteration overwrites the tile
      __syncthreads();
    }
  }
}
// print an M-by-N array
template <typename T>
void print(size_t m, size_t n, thrust::device_vector<T> &d_data) {
thrust::host_vector<T> h_data = d_data;
for (size_t i = 0; i < m; i++) {
for (size_t j = 0; j < n; j++)
std::cout << std::setw(1) << h_data[i * n + j] << " ";
std::cout << "\n";
}
}
template <typename T>
bool verify_res(size_t m, size_t n, const thrust::device_vector<T> &ref_data,
const thrust::device_vector<T> &res_data,
T abs_error = T(1e-4)) {
thrust::host_vector<T> href_data = ref_data;
thrust::host_vector<T> hres_data = res_data;
T max_error = std::numeric_limits<T>::lowest();
int num_errors = 0;
for (size_t i = 0; i < m; i++) {
for (size_t j = 0; j < n; j++) {
auto tmp_error = std::abs(hres_data[i * n + j] - href_data[i * n + j]);
// std::cout<<tmp_error<<"\n";
if (tmp_error > abs_error)
num_errors++;
max_error = max_error > tmp_error ? max_error : tmp_error;
}
}
std::cout << "num_error: " << num_errors << " max error= " << max_error
<< " \n";
return num_errors == 0;
}
// nvcc transpose.cu -O2 --extended-lambda -l cublas -o transpose_sample
int main(void) {
size_t m = 321; // number of rows
size_t n = 344; // number of columns
// 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
std::vector<double> h_data(m * n);
  std::random_device rd;  // seed source for the random number engine
  std::mt19937 gen(rd()); // standard mersenne_twister_engine seeded with rd()
  std::uniform_real_distribution<double> dis(1, 10);
  std::generate(h_data.begin(), h_data.end(),
                [&gen, &dis]() { return dis(gen); });
  thrust::device_vector<double> data = h_data;
/*
data[1] = 2.;
data[3] = 3.;
data[m * n - 1] = 8.;
thrust::generate(data.begin(), data.end(), [] __device__() {
thrust::default_random_engine random_engine;
thrust::uniform_real_distribution<double> dist(0.1, 10.);
return dist(random_engine);
});
*/
std::cout << "Initial array" << std::endl;
// print(m, n, data);
std::cout << "Transpose array - Thrust" << std::endl;
thrust::device_vector<double> transposed_thrust(m * n);
transpose(m, n, data, transposed_thrust);
// print(n, m, transposed_thrust);
std::cout << "Transpose array - cuBLAS" << std::endl;
thrust::device_vector<double> transposed_cuBLAS(m * n);
double *dv_ptr_in = thrust::raw_pointer_cast(data.data());
double *dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
double alpha = 1.;
double beta = 0.;
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
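  // The row-major m x n input is, from cuBLAS's column-major point of view,
  // an n x m matrix with leading dimension n; transposing it with geam
  // (alpha = 1, beta = 0) gives an m x n column-major result (ldc = m),
  // which reinterpreted as row-major is exactly the n x m transpose.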
cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha,
dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m));
// print(n, m, transposed_cuBLAS);
/*
cudaStream_t stream0;
cudaStreamCreate(&stream0);
cublasSetStream_v2(handle, stream0);
*/
// getch();
std::cout << "Transpose array - kernel" << std::endl;
thrust::device_vector<double> transposed_kernel_data(m * n);
double *dv_ptr_kernel_out =
thrust::raw_pointer_cast(transposed_kernel_data.data());
const dim3 block(32, 32);
  // grid.x covers dst columns (dstN = m), grid.y covers dst rows (dstM = n);
  // the grid-stride loops in the kernel handle any remaining tiles
  const dim3 grid((m + 31) / 32, (n + 31) / 32);
//const dim3 grid(2, 3);
transpose_kernel<<<grid, block>>>(dv_ptr_in, dv_ptr_kernel_out, n, m);
verify_res(n, m, transposed_thrust, transposed_kernel_data);
// cudaDeviceSynchronize();
//
// print(n, m, transposed_kernel_data);
return 0;
}
The hand-written kernel:
// shared-memory transpose
template <typename T>
__global__ void __launch_bounds__(1024)
    transpose_kernel(const T *src, T *dst, int dstM, int dstN) {
  __shared__ T shared_array[32][33];
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  for (int block_offset_y = blockIdx.y * blockDim.y; block_offset_y < dstM;
       block_offset_y += blockDim.y * gridDim.y) {
    for (int block_offset_x = blockIdx.x * blockDim.x; block_offset_x < dstN;
         block_offset_x += blockDim.x * gridDim.x) {
      // src coordinates
      int src_col = block_offset_y + tx;
      int src_row = block_offset_x + ty;
      if (src_col < dstM && src_row < dstN) {
        shared_array[ty][tx] = src[src_row * dstM + src_col]; // coalesced load
      }
      __syncthreads();
      // dst coordinates: the block/thread mapping follows the dst matrix
      int dst_row = block_offset_y + ty;
      int dst_col = block_offset_x + tx;
      if (dst_row < dstM && dst_col < dstN) {
        dst[dst_row * dstN + dst_col] = shared_array[tx][ty];
      }
      // wait until every thread has read its element before the next
      // grid-stride iteration overwrites the tile
      __syncthreads();
    }
  }
}
In general, reads and writes to global memory need to be guarded by a predicate (an if that prevents out-of-bounds accesses); in practice only the loads and stores themselves need this protection.
The block/grid is mapped to the logical coordinates of the dst matrix, and the corresponding src coordinates are derived from that mapping.
The [32][33] tile (one extra padding column) reduces shared-memory bank conflicts.
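A quick way to see why the padding column helps (a stand-alone host-side illustration added here, not part of the program above): for 4-byte elements the shared-memory bank of a word at offset off is off % 32, so we can print the banks touched by a warp reading one column of the tile:

#include <cstdio>

int main() {
  // bank of element [tx][col] for 4-byte words: (tx * width + col) % 32
  const int widths[] = {32, 33};
  for (int width : widths) {
    std::printf("width = %d:", width);
    for (int tx = 0; tx < 32; ++tx)
      std::printf(" %2d", (tx * width + 0) % 32); // column 0 read by a warp
    std::printf("\n");
  }
  return 0;
}

With a 32-wide row every thread of the warp hits bank 0 (a 32-way conflict), while the 33-wide row spreads the accesses over banks 0..31.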