/cutlass/include/cutlass/reduction/device/tensor_reduce.h
// Rounds 'value' up to the smallest power of two >= value (bit-smearing trick).
__forceinline__ __host__ __device__ int least_pow2_bound(int value) {
unsigned int value_ = static_cast<unsigned int>(value);
--value_;
value_ |= value_ >> 1;
value_ |= value_ >> 2;
value_ |= value_ >> 4;
value_ |= value_ >> 8;
value_ |= value_ >> 16;
return static_cast<int>(++value_);
}
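A quick host-side check (my own sketch, with the qualifiers dropped so it compiles as plain C++) of what the helper returns:
```
#include <cassert>

// Host-only copy of least_pow2_bound from above (CUDA qualifiers removed).
int least_pow2_bound(int value) {
  unsigned int v = static_cast<unsigned int>(value);
  --v;
  v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
  return static_cast<int>(++v);
}

int main() {
  assert(least_pow2_bound(1)  == 1);
  assert(least_pow2_bound(5)  == 8);
  assert(least_pow2_bound(64) == 64);   // powers of two map to themselves
  assert(least_pow2_bound(65) == 128);
  return 0;
}
```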
It feels like only 4-D tensors are really well supported.
Running example: src_extent = {2, 3, 4, 64}.
For the strided path, when reduction_index is not the last dimension, the constructor permutes that dimension to the second-to-last position (index 2 of extent_affine), as the switch below shows (worked example after the snippet):
TensorReduction(
TensorCoord extent,
int reduction_index_
):
reduction_index(reduction_index_) {
Coord<4> extent_affine;
switch (reduction_index) {
case 0:
extent_affine[0] = extent[1];
extent_affine[1] = extent[2];
extent_affine[2] = extent[0];
extent_affine[3] = extent[3];
break;
case 1:
extent_affine[0] = extent[0];
extent_affine[1] = extent[2];
extent_affine[2] = extent[1];
extent_affine[3] = extent[3];
break;
case 2:
extent_affine[0] = extent[0];
extent_affine[1] = extent[1];
extent_affine[2] = extent[2];
extent_affine[3] = extent[3];
break;
case 3:
extent_affine[0] = extent[0];
extent_affine[1] = extent[1];
extent_affine[2] = extent[2];
extent_affine[3] = extent[3];
break;
default: break;
}
if (reduction_index == 3) {
reduction_contiguous = ReductionDeviceContiguousOperator(extent_affine);
}
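A worked example of the permutation (my own sketch, using the src_extent above): with reduction_index == 1, the dimension of size 3 is moved to index 2, the second-to-last slot.
```
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> extent = {2, 3, 4, 64};   // src_extent from the notes above
  // case 1 of the switch: extent_affine = {extent[0], extent[2], extent[1], extent[3]}
  std::array<int, 4> extent_affine = {extent[0], extent[2], extent[1], extent[3]};
  assert((extent_affine == std::array<int, 4>{2, 4, 3, 64}));   // reduced dim now at index 2
  return 0;
}
```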
outer_count and inner_count here play the role of a matrix's m and n. The grid/block shape is derived from them, capped by a target threadblock count: Priority 1 assigns threadblocks to the outer indices (cta_count_y; cta_count_x stays 1), Priority 2 assigns the inner dimension within a CTA, and only when outer_count is small and inner_count is large does inner_count spill into grid.z (worked example after the snippet).
// Priority 1. Assign threadblocks to outer indices if possible
if (outer_count > target_threadblock_count) {
cta_count_x = 1;
cta_count_y = target_threadblock_count;
cta_count_z = 1;
}
else {
cta_count_y = int(outer_count);
int remaining_ctas = target_threadblock_count / cta_count_y;
// Priority 2. Assign inner dimensions to one CTA
if (inner_vector_count > cta_threads_x) {
int64_t cta_z_bound = inner_vector_count / cta_threads_x;
if (cta_z_bound > remaining_ctas) {
cta_count_z = remaining_ctas;
}
else {
cta_count_z = int(cta_z_bound);
}
}
else {
cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);
cta_count_z = 1;
}
}
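To make the branch above concrete, here is a worked sketch with assumed values (kThreads = 256, kVectorLength = 1, and a target_threadblock_count of 128, assumed for this example); it reproduces the launch shape observed further down, grid(1,24,1) / block(4,1,1).
```
#include <cassert>
#include <cstdint>

// Local copy of the reshape_pow2 helper discussed below.
int reshape_pow2(int ext, int count) {
  if (ext > count) return 1;
  int x = 1;
  for (; count >= ext * 2; count >>= 1) x <<= 1;
  return x;
}

int main() {
  int64_t outer_count = 2 * 3 * 4;        // extent_affine = {2,3,4,64}, reducing the last dim
  int64_t inner_vector_count = 64;        // inner_count / kVectorLength with kVectorLength = 1
  int cta_threads_x = 256;                // kThreads (assumed default)
  int target_threadblock_count = 128;     // assumed target for this sketch

  int cta_count_x = 1, cta_count_y = 1, cta_count_z = 1;
  if (outer_count > target_threadblock_count) {
    cta_count_y = target_threadblock_count;
  } else {
    cta_count_y = int(outer_count);                                 // 24
    int remaining_ctas = target_threadblock_count / cta_count_y;    // 5
    if (inner_vector_count > cta_threads_x) {
      int64_t cta_z_bound = inner_vector_count / cta_threads_x;
      cta_count_z = int(cta_z_bound > remaining_ctas ? remaining_ctas : cta_z_bound);
    } else {
      cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);  // 4
    }
    (void)remaining_ctas;
  }

  assert(cta_count_x == 1 && cta_count_y == 24 && cta_count_z == 1);  // grid(1, 24, 1)
  assert(cta_threads_x == 4);                                         // block(4, 1, 1)
  return 0;
}
```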
/// Helper to reshape 'count' such that it is less than 2 x 'ext'
// I couldn't tell at first what this turns 'count' into; it looks wrong, blockDim.x ends up very small, arguably it should just return count. Compare onnxruntime's least_pow2 helper. (See the check after this function.)
static int reshape_pow2(int ext, int count) {
if (ext > count) {
return 1;
} // ext > count is treated as degenerate and returns 1; 'count' is the maximum thread_x budget
int x = 1;
for (; count >= ext * 2; ) {
count >>= 1;
x <<= 1;
}
return x;
}
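What reshape_pow2 actually computes, as far as I can tell: for a power-of-two 'count' it returns the largest power-of-two x with count / x >= ext, i.e. the number of "ways" count can be split into chunks of at least ext. A quick host check (my own sketch, with a local copy of the helper):
```
#include <cassert>

int reshape_pow2(int ext, int count) {   // copy of the helper above
  if (ext > count) return 1;
  int x = 1;
  for (; count >= ext * 2; count >>= 1) x <<= 1;
  return x;
}

int main() {
  assert(reshape_pow2(64, 256)  == 4);   // 256/4  = 64 >= 64, 256/8 = 32 < 64
  assert(reshape_pow2(16, 256)  == 16);  // 256/16 = 16 >= 16
  assert(reshape_pow2(3, 256)   == 64);  // 256/64 = 4  >= 3
  assert(reshape_pow2(300, 256) == 1);   // ext > count -> degenerate case
  return 0;
}
```
Used as cta_threads_x in the branch above, this is exactly why blockDim.x comes out as 4 in the worked example.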
当 使用到了cta_count_z
即连续reduce的值分到了不同的 block上会使用 workspace
workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
The TensorReduction constructor therefore mainly fixes the grid and block parameters and, via workspace_count, whether a second cross-CTA reduction kernel has to be launched. When several grid.z slices work on inner_count, the workspace occupies outer_count * grid.z * sizeof(ElementCompute) bytes of global memory (worked numbers after workspace_size() below).
The reduce() member function (it can take a CUDA stream) also sets up the source/destination strides according to reduction_index:
case 3:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[1];
src_stride[2] = src_ref.stride()[0];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[1];
dst_stride[2] = dst_ref.stride()[0];
The strides are stored in reversed order.
The kernel Params constructor:
// Initialize divisors for fast div-mod
for (int p = 1; p < kRank; ++p) {
divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
}
for (int p = 0; p < kReducedRank; ++p) {
dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
}
for (int p = 0; p < kRank - 1; ++p) {
src_stride[p] = src_stride_[p] * input_size_bits / 8;
}
The strides stored in Params are converted from element units to bytes (element stride * sizeof_bits / 8); inside the kernel every stride and offset is in bytes, applied to a char pointer.
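A tiny illustration of the conversion (my own sketch, assuming a 16-bit ElementSource such as half):
```
#include <cassert>
#include <cstdint>

int main() {
  int64_t src_stride_elems = 64;   // some stride expressed in elements
  int input_size_bits = 16;        // sizeof_bits<ElementSource>::value for a half-precision source
  int64_t src_stride_bytes = src_stride_elems * input_size_bits / 8;
  assert(src_stride_bytes == 128); // the kernel adds this to a char pointer
  return 0;
}
```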
uint64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
This is the outer index, i.e. which output element this thread handles (grid.y was sized from outer_count).
Observed launch for the running example: grid(1, 24, 1), block(4, 1, 1).
Fast division:
Coord<kReducedRank> & coord,
coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
This decomposes linear_idx into a multi-dimensional coordinate; FastDivmodU64 is worth studying on its own.
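The idea behind the decomposition, written with plain '/' and '%' (my own sketch; FastDivmodU64 computes the same quotient/remainder with a precomputed multiply-and-shift instead of a hardware divide). With the divisors built above, divmod[p-1] holds extent[p]:
```
#include <cassert>
#include <cstdint>

int main() {
  int extent[3] = {2, 4, 3};      // an assumed reduced extent, slowest rank first
  uint64_t linear_idx = 17;       // 17 = (1*4 + 1)*3 + 2

  int coord[3];
  uint64_t rest = linear_idx;
  for (int i = 2; i > 0; --i) {   // peel off the fastest-varying ranks first
    coord[i] = int(rest % extent[i]);
    rest /= extent[i];
  }
  coord[0] = int(rest);

  assert(coord[0] == 1 && coord[1] == 1 && coord[2] == 2);
  return 0;
}
```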
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < kReducedRank; ++i) {
dst_offset += params.dst_stride[i] * coord[i];
src_offset += params.src_stride[i] * coord[i];
}
Why does this look column-major, given that the strides were stored reversed? The mapping is not obvious at first; the likely reason is that the layout's stride() is ordered fastest-rank-first while extent/coord are ordered slowest-rank-first, so reversing the strides lines the two up.
`ComputeFragment` has length kVectorLength; kBatchSize is 4.
for (int b = 0; b < kBatchSize; ++b) {
if (linear_idx < params.inner_count) {
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
guards[b] = true;
}
else {
guards[b] = false;
not_done = false;
}
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
}
The loop keeps loading values; the index that advances is linear_idx. Once it runs past inner_count the guard is cleared and not_done is set to false, i.e. that slot is finished.
This guard/batch pattern is a neat trick that replaces a plain element-by-element loop.
The core reduction loop:
CUTLASS_PRAGMA_UNROLL
for (int b = 0; b < kBatchSize; ++b) {
if (linear_idx < params.inner_count) {
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
guards[b] = true;
}
else {
guards[b] = false;
not_done = false;
}
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
}
// Perform a batch of reduction operations
CUTLASS_PRAGMA_UNROLL
for (int b = 0; b < kBatchSize; ++b) {
if (guards[b]) {
auto cvt = convert_source(source_fragment[b]);
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
reduction_op,
accumulator,
cvt);
} // In the common case (kVectorLength == 1) accumulator is Array<ElementCompute, 1>, so ApplyArrayOperator is effectively a scalar reduction
}
};
using ComputeFragment = Array<ElementCompute, VectorLength>;
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
Compared with my own version, this compute loop is unrolled by kBatchSize; the gain comes from the unrolling. My equivalent would look like:
for (auto id = blockIdx.x * blockDim.x + threadIdx.x; id < num_cols; id += blockDim.x * gridDim.x * batch)
{
  #pragma unroll
  for (int bi = 0; bi < batch; ++bi)
  {
    auto src_offset = id + bi * blockDim.x * gridDim.x;
    if (src_offset < num_cols)
    {
      accumulator = reduceOp(accumulator, src_ptr[src_offset]);
    }
  }
}
idx_linear is the outer loop (over output elements) and linear_idx is the inner loop (over the reduced elements). It feels like too few threads are allocated; the result is correct, but utilization is low.
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
blockDim.x here is not even a full warp.
Each thread then writes its partial value to shared memory and the block reduces there:
// ...
CUTLASS_PRAGMA_NO_UNROLL
while (thread_count > 1) {
thread_count /= 2;
__syncthreads();
if (thread_j < thread_count) {
ElementCompute other = frag_ptr[thread_j + thread_count];
reduced_accumulator = reduction_op(reduced_accumulator, other);
frag_ptr[thread_j] = reduced_accumulator;
}
__syncthreads();
}
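A host-side sketch of that halving loop (my own, assuming a plus reduction over 8 partials): each round folds the upper half onto the lower half until slot 0 holds the block result.
```
#include <cassert>

int main() {
  float smem[8] = {1, 2, 3, 4, 5, 6, 7, 8};   // one partial value per "thread"
  int thread_count = 8;
  while (thread_count > 1) {
    thread_count /= 2;
    for (int thread_j = 0; thread_j < thread_count; ++thread_j) {  // stands in for the synchronized threads
      smem[thread_j] = smem[thread_j] + smem[thread_j + thread_count];
    }
  }
  assert(smem[0] == 36.0f);
  return 0;
}
```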
workspace_size / workspace_stride:
int64_t workspace_size() const {
// Error condition
if (!good()) {
return 0;
}
// No reduction across CTAs
if (grid_shape.z == 1) {
return 0;
}
return workspace_stride() * grid_shape.z;
}
/// Size (in bytes) of <outer_count> workspace elements which are densely packed together
int64_t workspace_stride() const {
// Error condition
if (!good()) {
return 0;
}
return outer_count * sizeof_bits<ElementCompute>::value / 8;
}
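Worked numbers (my own sketch): with ElementCompute = float, outer_count = 24 and grid_shape.z = 2, the workspace comes out to 192 bytes.
```
#include <cassert>
#include <cstdint>

int main() {
  int64_t outer_count = 24;
  int grid_z = 2;
  int compute_size_bits = 32;                                       // sizeof_bits<float>::value
  int64_t workspace_stride = outer_count * compute_size_bits / 8;   // 96 bytes per z slice
  int64_t workspace_size   = workspace_stride * grid_z;             // 192 bytes total
  assert(workspace_stride == 96 && workspace_size == 192);
  return 0;
}
```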
At least this contiguous reduce does not feel like anything special...
The strided reduce path:
int cta_width = kThreads * kVectorLength;
int cta_ways = reshape_pow2(extent_c, cta_width);
int cta_threads_x = kThreads / cta_ways;
threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
This part looks much more sensible.
using ReductionDeviceStridedOperator = TensorReductionAffineStrided<
4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
>;
kReducedRank is fixed at 3, i.e. the reduced (strided) dimension always sits at index 2 of the permuted extent.
switch (reduction_index) {
case 0:
extent_affine[0] = extent[1];
extent_affine[1] = extent[2];
extent_affine[2] = extent[0];
extent_affine[3] = extent[3];
break;
case 1:
extent_affine[0] = extent[0];
extent_affine[1] = extent[2];
extent_affine[2] = extent[1];
extent_affine[3] = extent[3];
break;
case 2:
extent_affine[0] = extent[0];
extent_affine[1] = extent[1];
extent_affine[2] = extent[2];
extent_affine[3] = extent[3];
break;
This differs from the contiguous reduce: only cases 0-2 go down the strided path (reduction_index == 3 was already handed to the contiguous operator above).
// Compute number of elements in strided ranks
for (int p = 0; p < kReducedRank - 1; ++p) {
outer_count *= extent[p];
}
for (int p = 0; p < kInnerRank; ++p) {
inner_count *= extent[kReducedRank + p - 1];
}
Here outer_count only multiplies the ranks up to kReducedRank - 1, and inner_count does not include the last (contiguous) dimension.
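Worked example (my own sketch) with kRank = 4, kReducedRank = 3, kInnerRank = 1 and the permuted extent {2, 4, 3, 64} from earlier:
```
#include <cassert>
#include <cstdint>

int main() {
  int const kReducedRank = 3, kInnerRank = 1;
  int extent[4] = {2, 4, 3, 64};   // reduced (strided) dimension sits at index 2

  int64_t outer_count = 1, inner_count = 1;
  for (int p = 0; p < kReducedRank - 1; ++p) outer_count *= extent[p];                 // 2 * 4
  for (int p = 0; p < kInnerRank; ++p) inner_count *= extent[kReducedRank + p - 1];    // extent[2]

  assert(outer_count == 8);
  assert(inner_count == 3);        // the trailing contiguous dim (64) is not included
  return 0;
}
```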
int cta_width = kThreads * kVectorLength;
int cta_ways = reshape_pow2(extent_c, cta_width);
int cta_threads_x = kThreads / cta_ways;
threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
// This leads to an error.
if (threadblock_shape.z > 1) {
if (threadblock_shape.y != 1) {
status = Status::kErrorInternal;
return;
}
}
So blockDim.y = 1, while blockDim.x and blockDim.z are determined by the column (contiguous) extent; inner_count, i.e. the size of the reduced dimension, is handled along z.
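Concretely (my own sketch, assuming kThreads = 256, kVectorLength = 1 and a contiguous extent of 64):
```
#include <algorithm>
#include <cassert>

int reshape_pow2(int ext, int count) {   // same helper as before
  if (ext > count) return 1;
  int x = 1;
  for (; count >= ext * 2; count >>= 1) x <<= 1;
  return x;
}

int main() {
  int const kThreads = 256, kVectorLength = 1;   // assumed defaults
  int extent_c = 64;                             // contiguous (last) extent

  int cta_width = kThreads * kVectorLength;            // 256
  int cta_ways = reshape_pow2(extent_c, cta_width);    // 4
  int cta_threads_x = kThreads / cta_ways;             // 64

  assert(cta_ways == 4 && cta_threads_x == 64);
  assert(std::min(cta_ways, 64) == 4);                 // threadblock_shape = (64, 1, 4)
  return 0;
}
```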
The strides inside Params are laid out differently as well:
switch (reduction_index) {
case 0:
src_stride[0] = src_ref.stride()[1];
src_stride[1] = src_ref.stride()[0];
src_stride[2] = src_ref.stride()[2];
dst_stride[0] = dst_ref.stride()[1];
dst_stride[1] = dst_ref.stride()[0];
break;
case 1:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[0];
src_stride[2] = src_ref.stride()[1];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[0];
break;
case 2:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[1];
src_stride[2] = src_ref.stride()[0];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[1];
break;
case 3:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[1];
src_stride[2] = src_ref.stride()[0];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[1];
dst_stride[2] = dst_ref.stride()[0];
// Only threadIdx.z == 0 produces the final value; it folds in the partials from the other z slices:
// Linearized thread ID
int thread_idx = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
// all threads store to workspace
ComputeFragment *frag_ptr = reinterpret_cast<ComputeFragment *>(threadblock_workspace);
frag_ptr[thread_idx] = accumulator;
__syncthreads();
if (threadIdx.z == 0) {
// Load all additional block indices
for (int z = 1; z < blockDim.z; ++z) {
ComputeFragment frag = frag_ptr[thread_idx + z * blockDim.x * blockDim.y];
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
reduction_op,
accumulator,
frag);
}
}
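Indexing sketch for that final combine (my own, assuming blockDim = (64, 1, 4)): each z == 0 thread folds in the partials written by the same (x, y) lane of the other z slices.
```
#include <cassert>

int main() {
  int bx = 64, by = 1;            // assumed blockDim.x, blockDim.y
  int thread_idx = 5;             // thread (x = 5, y = 0, z = 0)

  int slots[3];
  for (int z = 1; z < 4; ++z) {
    slots[z - 1] = thread_idx + z * bx * by;   // frag_ptr index written by the z-th slice
  }
  assert(slots[0] == 69 && slots[1] == 133 && slots[2] == 197);
  return 0;
}
```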