CUDA transpose

A CUDA transpose can be implemented with Thrust, cuBLAS, or cuBLASLt.
The code below uses the cuBLASLt API cublasLtMatrixTransform to transpose (change the layout of) a CUTLASS TensorView.
The basic idea is to configure the attributes of each handle/descriptor first and then make a single transform call. A short usage sketch follows the function.

#include <cublasLt.h>
#include <cutlass/layout/matrix.h>
#include <cutlass/tensor_view.h>
#include <stdexcept>

// Minimal status check used below (shown here so the snippet is self-contained).
inline void checkCublasStatus(cublasStatus_t status) {
  if (status != CUBLAS_STATUS_SUCCESS) {
    throw std::runtime_error("cuBLASLt API call failed");
  }
}

// Map element types to cuBLAS data-type enums.
template <typename T> inline cudaDataType_t GetCublasDType();
template <> inline cudaDataType_t GetCublasDType<float>() { return CUDA_R_32F; }
template <> inline cudaDataType_t GetCublasDType<half>() { return CUDA_R_16F; }
template <> inline cudaDataType_t GetCublasDType<int8_t>() { return CUDA_R_8I; }
template <> inline cudaDataType_t GetCublasDType<int>() { return CUDA_R_32I; }
template <> inline cudaDataType_t GetCublasDType<double>() {
  return CUDA_R_64F;
}

template <> inline cudaDataType_t GetCublasDType<cutlass::half_t>() {
  return CUDA_R_16F;
}

// Map CUTLASS layout tags to cuBLASLt memory-order enums.
template <typename T> inline cublasLtOrder_t GetCublasLtLayout();
template <>
inline cublasLtOrder_t GetCublasLtLayout<cutlass::layout::ColumnMajor>() {
  return CUBLASLT_ORDER_COL;
}

template <>
inline cublasLtOrder_t GetCublasLtLayout<cutlass::layout::RowMajor>() {
  return CUBLASLT_ORDER_ROW;
}

template <>
inline cublasLtOrder_t
GetCublasLtLayout<cutlass::layout::ColumnMajorInterleaved<32>>() {
  return CUBLASLT_ORDER_COL32;
}

template <typename Element, typename LayoutSrc, typename LayoutDes>
void change_layout_with_cublasLT(
    const cutlass::TensorView<Element, LayoutSrc> &src_view,
    cutlass::TensorView<Element, LayoutDes> &des_view,
    cudaStream_t stream = nullptr) {

  const auto src_extent = src_view.extent();
  const auto des_extent = des_view.extent();
  const auto src_cols = src_extent.column();
  const auto src_rows = src_extent.row();
  const auto des_cols = des_extent.column();
  const auto des_rows = des_extent.row();

  if ((src_cols != des_cols) || (src_rows != des_rows)) {
    throw std::logic_error(
        "change_layout_with_cublasLT only changes the memory layout: src and "
        "des must have the same number of rows and columns\n");
  }
  cublasLtMatrixLayout_t src_layout;
  cublasLtMatrixLayout_t des_layout;
  cublasLtOrder_t src_order = GetCublasLtLayout<LayoutSrc>();
  cublasLtOrder_t des_order = GetCublasLtLayout<LayoutDes>();
  cublasLtMatrixTransformDesc_t transformDesc = nullptr;
  cublasLtHandle_t ltHandle;
  const auto element_type = GetCublasDType<Element>();
  checkCublasStatus(cublasLtCreate(&ltHandle));

  checkCublasStatus(cublasLtMatrixLayoutCreate(
      &src_layout, element_type, src_rows, src_cols,
      src_view.stride(0))); // leading dimension taken from the view's stride
  checkCublasStatus(cublasLtMatrixLayoutCreate(
      &des_layout, element_type, des_rows, des_cols,
      des_view.stride(0))); // the actual row/column order is set below
  checkCublasStatus(cublasLtMatrixLayoutSetAttribute(
      src_layout, CUBLASLT_MATRIX_LAYOUT_ORDER, &src_order, sizeof(src_order)));

  checkCublasStatus(cublasLtMatrixLayoutSetAttribute(
      des_layout, CUBLASLT_MATRIX_LAYOUT_ORDER, &des_order, sizeof(des_order)));

  checkCublasStatus(cublasLtMatrixTransformDescCreate(
      &transformDesc, CUDA_R_32F)); // CUDA_R_32F is the scale type of alpha/beta
  float alpha = 1.0f;
  float beta = 0.0f;
  checkCublasStatus(cublasLtMatrixTransform(
      ltHandle, transformDesc, &alpha, src_view.data(), src_layout, &beta,
      nullptr, nullptr, des_view.data(), des_layout, stream));

  // Release the cuBLASLt resources created in this call.
  checkCublasStatus(cublasLtMatrixTransformDescDestroy(transformDesc));
  checkCublasStatus(cublasLtMatrixLayoutDestroy(src_layout));
  checkCublasStatus(cublasLtMatrixLayoutDestroy(des_layout));
  checkCublasStatus(cublasLtDestroy(ltHandle));
}
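
As a rough usage sketch (assuming CUTLASS's HostTensor utility from cutlass/util/host_tensor.h; the dimensions and names below are illustrative only, not part of the original code), converting a row-major matrix to column-major storage could look like this:

#include <cutlass/util/host_tensor.h>

// Illustrative only: convert a 64x128 row-major matrix to column-major storage.
void example_change_layout(cudaStream_t stream) {
  cutlass::HostTensor<float, cutlass::layout::RowMajor> src({64, 128});
  cutlass::HostTensor<float, cutlass::layout::ColumnMajor> des({64, 128});

  // ... fill src.host_view() ..., then copy to the device
  src.sync_device();

  auto src_view = src.device_view();
  auto des_view = des.device_view();
  change_layout_with_cublasLT(src_view, des_view, stream);

  des.sync_host(); // des now holds the same matrix in column-major order
}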

The code below contains Thrust, cuBLAS, and handwritten shared-memory transpose examples.


#include <assert.h>
#include <cublas_v2.h>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
#include <algorithm>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/random.h>
#include <thrust/scan.h>
#include <vector>

/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif

inline void __cublasSafeCall(cublasStatus_t err, const char *file,
                             const int line) {
  if (CUBLAS_STATUS_SUCCESS != err) {
    fprintf(stderr,
            "CUBLAS error in file '%s', line %d\nerror %d\nterminating!\n",
            file, line, err);
    cudaDeviceReset();
    assert(0);
  }
}

// convert a linear index to a linear index in the transpose
struct transpose_index : public thrust::unary_function<size_t, size_t> {
  size_t m, n;

  __host__ __device__ transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}

  __host__ __device__ size_t operator()(size_t linear_index) {
    size_t i = linear_index / n;
    size_t j = linear_index % n;

    return m * j + i;
  }
};

// convert a linear index to a row index
struct row_index : public thrust::unary_function<size_t, size_t> {
  size_t n;

  __host__ __device__ row_index(size_t _n) : n(_n) {}

  __host__ __device__ size_t operator()(size_t i) { return i / n; }
};

// transpose an M-by-N array
template <typename T>
void transpose(size_t m, size_t n, thrust::device_vector<T> &src,
               thrust::device_vector<T> &dst) {
  thrust::counting_iterator<size_t> indices(0);

  // dst[i] = src[map(i)]: the map takes a linear index in the n-by-m dst and
  // returns the corresponding linear index in the m-by-n src.
  thrust::gather(
      thrust::make_transform_iterator(indices, transpose_index(n, m)),
      thrust::make_transform_iterator(indices, transpose_index(n, m)) +
          dst.size(),
      src.begin(), dst.begin());
}

// shared memory transpose (assumes a 32x32 thread block)
template <typename T>
__global__ void __launch_bounds__(1024)
    transpose_kernel(const T *src, T *dst, int dstM, int dstN) {
  __shared__ T share_array[32][33]; // extra column avoids bank conflicts
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Grid-stride loops over 32x32 tiles of the dst matrix.
  for (int block_offset_y = blockIdx.y * blockDim.y; block_offset_y < dstM;
       block_offset_y += blockDim.y * gridDim.y) {
    for (int block_offset_x = blockIdx.x * blockDim.x; block_offset_x < dstN;
         block_offset_x += blockDim.x * gridDim.x) {
      // src coordinate
      int src_col = block_offset_y + tx;
      int src_row = block_offset_x + ty;

      if (src_col < dstM && src_row < dstN) {
        share_array[ty][tx] = src[src_row * dstM + src_col]; // coalesced load
      }
      __syncthreads();
      // dst coordinate: the block/thread indices are mapped to the dst matrix
      int dst_row = block_offset_y + ty;
      int dst_col = block_offset_x + tx;
      if (dst_row < dstM && dst_col < dstN) {
        dst[dst_row * dstN + dst_col] = share_array[tx][ty]; // coalesced store
      }
      __syncthreads(); // the tile is reused on the next grid-stride iteration
    }
  }
}
// print an M-by-N array
template <typename T>
void print(size_t m, size_t n, thrust::device_vector<T> &d_data) {
  thrust::host_vector<T> h_data = d_data;

  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < n; j++)
      std::cout << std::setw(1) << h_data[i * n + j] << " ";
    std::cout << "\n";
  }
}

template <typename T>
bool verify_res(size_t m, size_t n, const thrust::device_vector<T> &ref_data,
                const thrust::device_vector<T> &res_data,
                T abs_error = T(1e-4)) {
  thrust::host_vector<T> href_data = ref_data;
  thrust::host_vector<T> hres_data = res_data;

  T max_error = std::numeric_limits<T>::lowest();
  int num_errors = 0;
  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < n; j++) {
      auto tmp_error = std::abs(hres_data[i * n + j] - href_data[i * n + j]);
      // std::cout<<tmp_error<<"\n";
      if (tmp_error > abs_error)
        num_errors++;
      max_error = max_error > tmp_error ? max_error : tmp_error;
    }
  }
  std::cout << "num_error: " << num_errors << " max error= " << max_error
            << " \n";
  return num_errors == 0;
}
// nvcc transpose.cu -O2 --extended-lambda  -l cublas -o transpose_sample
int main(void) {
  size_t m = 321; // number of rows
  size_t n = 344; // number of columns

  // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
  std::vector<double> h_data(m * n);

  std::random_device rd;  // seed source for the random number engine
  std::mt19937 gen(rd()); // standard mersenne_twister_engine seeded with rd()
  std::uniform_real_distribution<double> dis(1, 10);
  std::generate(h_data.begin(), h_data.end(), [&gen, &dis]() { return dis(gen); });
  thrust::device_vector<double> data = h_data;
  /*
    data[1] = 2.;
    data[3] = 3.;
    data[m * n - 1] = 8.;

    thrust::generate(data.begin(), data.end(), [] __device__() {
      thrust::default_random_engine random_engine;

      thrust::uniform_real_distribution<double> dist(0.1, 10.);

      return dist(random_engine);
    });
    */
  std::cout << "Initial array" << std::endl;
  // print(m, n, data);

  std::cout << "Transpose array - Thrust" << std::endl;
  thrust::device_vector<double> transposed_thrust(m * n);
  transpose(m, n, data, transposed_thrust);
  // print(n, m, transposed_thrust);

  std::cout << "Transpose array - cuBLAS" << std::endl;
  thrust::device_vector<double> transposed_cuBLAS(m * n);
  double *dv_ptr_in = thrust::raw_pointer_cast(data.data());
  double *dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
  double alpha = 1.;
  double beta = 0.;
  cublasHandle_t handle;
  cublasSafeCall(cublasCreate(&handle));
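  // Trick: the row-major m-by-n input buffer is handed to cuBLAS as a
  // column-major n-by-m matrix (lda = n). Transposing it with CUBLAS_OP_T and
  // writing the m-by-n column-major result (ldc = m) leaves dv_ptr_out holding
  // the n-by-m row-major transpose. beta = 0, so the second operand is unused.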
  cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha,
                             dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m));
  // print(n, m, transposed_cuBLAS);
  /*
cudaStream_t stream0;
  cudaStreamCreate(&stream0);

  cublasSetStream_v2(handle, stream0);
*/
  std::cout << "Transpose array - kernel" << std::endl;

  thrust::device_vector<double> transposed_kernel_data(m * n);
  double *dv_ptr_kernel_out =
      thrust::raw_pointer_cast(transposed_kernel_data.data());
  const dim3 block(32, 32);
  const dim3 grid((n + 31) / 32, (m + 31) / 32);
  // Because the kernel uses grid-stride loops, a smaller grid (e.g. dim3(2, 3))
  // would still produce a full transpose, just with more iterations per block.

  transpose_kernel<<<grid, block>>>(dv_ptr_in, dv_ptr_kernel_out, n, m);
  verify_res(n, m, transposed_thrust, transposed_kernel_data);
  // cudaDeviceSynchronize();
  // print(n, m, transposed_kernel_data);

  return 0;
}

Handwritten kernel

// shared memory transpose (assumes a 32x32 thread block)
template <typename T>
__global__ void __launch_bounds__(1024)
    transpose_kernel(const T *src, T *dst, int dstM, int dstN) {
  __shared__ T share_array[32][33]; // extra column avoids bank conflicts
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Grid-stride loops over 32x32 tiles of the dst matrix.
  for (int block_offset_y = blockIdx.y * blockDim.y; block_offset_y < dstM;
       block_offset_y += blockDim.y * gridDim.y) {
    for (int block_offset_x = blockIdx.x * blockDim.x; block_offset_x < dstN;
         block_offset_x += blockDim.x * gridDim.x) {
      // src coordinate
      int src_col = block_offset_y + tx;
      int src_row = block_offset_x + ty;

      if (src_col < dstM && src_row < dstN) {
        share_array[ty][tx] = src[src_row * dstM + src_col]; // coalesced load
      }
      __syncthreads();
      // dst coordinate: the block/thread indices are mapped to the dst matrix
      int dst_row = block_offset_y + ty;
      int dst_col = block_offset_x + tx;
      if (dst_row < dstM && dst_col < dstN) {
        dst[dst_row * dstN + dst_col] = share_array[tx][ty]; // coalesced store
      }
      __syncthreads(); // the tile is reused on the next grid-stride iteration
    }
  }
}

In general, reads from and writes to global memory need predicate guards (an if that prevents out-of-bounds access); it is usually enough to predicate just the global loads and stores.
The block/grid indices are mapped to the logical coordinates of the dst matrix, and the src coordinates are derived from them.
The [32][33] shared-memory tile (one extra padding column) reduces bank conflicts; a sketch of the arithmetic follows.
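
A sketch of the bank arithmetic, assuming 32 four-byte shared-memory banks and 4-byte elements (the example program uses double, where each element spans two banks and the details differ, but the padding still breaks the worst-case pattern):

// tile[32][32]: the transposed read tile[tx][ty] (tx = 0..31 within a warp, ty fixed)
//   word offset = 32 * tx + ty  ->  bank = (32 * tx + ty) % 32 = ty
//   all 32 threads of the warp hit the same bank: a 32-way conflict
// tile[32][33]: same read pattern
//   word offset = 33 * tx + ty  ->  bank = (33 * tx + ty) % 32 = (tx + ty) % 32
//   the 32 threads hit 32 distinct banks: conflict-free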
