/cutlass/include/cutlass/reduction/device/tensor_reduce.h
// Rounds 'value' up to the smallest power of two >= value (bit-smearing trick).
__forceinline__ __host__ __device__ int least_pow2_bound(int value) {
unsigned int value_ = static_cast<unsigned int>(value);
--value_;
value_ |= value_ >> 1;
value_ |= value_ >> 2;
value_ |= value_ >> 4;
value_ |= value_ >> 8;
value_ |= value_ >> 16;
return static_cast<int>(++value_);
}
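A quick host-side check (my own sketch, with the qualifiers dropped so it compiles as plain C++) of what the helper returns:
```
#include <cassert>

// Host-only copy of least_pow2_bound from above (CUDA qualifiers removed).
int least_pow2_bound(int value) {
  unsigned int v = static_cast<unsigned int>(value);
  --v;
  v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
  return static_cast<int>(++v);
}

int main() {
  assert(least_pow2_bound(1)  == 1);
  assert(least_pow2_bound(5)  == 8);
  assert(least_pow2_bound(64) == 64);   // powers of two map to themselves
  assert(least_pow2_bound(65) == 128);
  return 0;
}
```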
It feels like only 4-D tensors are really well supported.
Running example: src_extent = {2, 3, 4, 64}.
For the strided path, when reduction_index is not the last dimension, the constructor permutes that dimension to the second-to-last position (index 2 of extent_affine), as the switch below shows (worked example after the snippet):
TensorReduction(
TensorCoord extent,
int reduction_index_
):
reduction_index(reduction_index_) {
Coord<4> extent_affine;
switch (reduction_index) {
case 0:
extent_affine[0] = extent[1];
extent_affine[1] = extent[2];
extent_affine[2] = extent[0];
extent_affine[3] = extent[3];
break;
case 1:
extent_affine[0] = extent[0];
extent_affine[1] = extent[2];
extent_affine[2] = extent[1];
extent_affine[3] = extent[3];
break;
case 2:
extent_affine[0] = extent[0];
extent_affine[1] = extent[1];
extent_affine[2] = extent[2];
extent_affine[3] = extent[3];
break;
case 3:
extent_affine[0] = extent[0];
extent_affine[1] = extent[1];
extent_affine[2] = extent[2];
extent_affine[3] = extent[3];
break;
default: break;
}
if (reduction_index == 3) {
reduction_contiguous = ReductionDeviceContiguousOperator(extent_affine);
}
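A worked example of the permutation (my own sketch, using the src_extent above): with reduction_index == 1, the dimension of size 3 is moved to index 2, the second-to-last slot.
```
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> extent = {2, 3, 4, 64};   // src_extent from the notes above
  // case 1 of the switch: extent_affine = {extent[0], extent[2], extent[1], extent[3]}
  std::array<int, 4> extent_affine = {extent[0], extent[2], extent[1], extent[3]};
  assert((extent_affine == std::array<int, 4>{2, 4, 3, 64}));   // reduced dim now at index 2
  return 0;
}
```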
outer_count and inner_count here play the role of a matrix's m and n. The grid/block shape is derived from them, capped by a target threadblock count: Priority 1 assigns threadblocks to the outer indices (cta_count_y; cta_count_x stays 1), Priority 2 assigns the inner dimension within a CTA, and only when outer_count is small and inner_count is large does inner_count spill into grid.z (worked example after the snippet).
// Priority 1. Assign threadblocks to outer indices if possible
if (outer_count > target_threadblock_count) {
cta_count_x = 1;
cta_count_y = target_threadblock_count;
cta_count_z = 1;
}
else {
cta_count_y = int(outer_count);
int remaining_ctas = target_threadblock_count / cta_count_y;
// Priority 2. Assign inner dimensions to one CTA
if (inner_vector_count > cta_threads_x) {
int64_t cta_z_bound = inner_vector_count / cta_threads_x;
if (cta_z_bound > remaining_ctas) {
cta_count_z = remaining_ctas;
}
else {
cta_count_z = int(cta_z_bound);
}
}
else {
cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);
cta_count_z = 1;
}
}
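To make the branch above concrete, here is a worked sketch with assumed values (kThreads = 256, kVectorLength = 1, and a target_threadblock_count of 128, assumed for this example); it reproduces the launch shape observed further down, grid(1,24,1) / block(4,1,1).
```
#include <cassert>
#include <cstdint>

// Local copy of the reshape_pow2 helper discussed below.
int reshape_pow2(int ext, int count) {
  if (ext > count) return 1;
  int x = 1;
  for (; count >= ext * 2; count >>= 1) x <<= 1;
  return x;
}

int main() {
  int64_t outer_count = 2 * 3 * 4;        // extent_affine = {2,3,4,64}, reducing the last dim
  int64_t inner_vector_count = 64;        // inner_count / kVectorLength with kVectorLength = 1
  int cta_threads_x = 256;                // kThreads (assumed default)
  int target_threadblock_count = 128;     // assumed target for this sketch

  int cta_count_x = 1, cta_count_y = 1, cta_count_z = 1;
  if (outer_count > target_threadblock_count) {
    cta_count_y = target_threadblock_count;
  } else {
    cta_count_y = int(outer_count);                                 // 24
    int remaining_ctas = target_threadblock_count / cta_count_y;    // 5
    if (inner_vector_count > cta_threads_x) {
      int64_t cta_z_bound = inner_vector_count / cta_threads_x;
      cta_count_z = int(cta_z_bound > remaining_ctas ? remaining_ctas : cta_z_bound);
    } else {
      cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);  // 4
    }
    (void)remaining_ctas;
  }

  assert(cta_count_x == 1 && cta_count_y == 24 && cta_count_z == 1);  // grid(1, 24, 1)
  assert(cta_threads_x == 4);                                         // block(4, 1, 1)
  return 0;
}
```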
/// Helper to reshape 'count' such that it is less than 2 x 'ext'
// I couldn't tell at first what this turns 'count' into; it looks wrong, blockDim.x ends up very small, arguably it should just return count. Compare onnxruntime's least_pow2 helper. (See the check after this function.)
static int reshape_pow2(int ext, int count) {
if (ext > count) {
return 1;
} // ext > count is treated as degenerate and returns 1; 'count' is the maximum thread_x budget
int x = 1;
for (; count >= ext * 2; ) {
count >>= 1;
x <<= 1;
}
return x;
}
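What reshape_pow2 actually computes, as far as I can tell: for a power-of-two 'count' it returns the largest power-of-two x with count / x >= ext, i.e. the number of "ways" count can be split into chunks of at least ext. A quick host check (my own sketch, with a local copy of the helper):
```
#include <cassert>

int reshape_pow2(int ext, int count) {   // copy of the helper above
  if (ext > count) return 1;
  int x = 1;
  for (; count >= ext * 2; count >>= 1) x <<= 1;
  return x;
}

int main() {
  assert(reshape_pow2(64, 256)  == 4);   // 256/4  = 64 >= 64, 256/8 = 32 < 64
  assert(reshape_pow2(16, 256)  == 16);  // 256/16 = 16 >= 16
  assert(reshape_pow2(3, 256)   == 64);  // 256/64 = 4  >= 3
  assert(reshape_pow2(300, 256) == 1);   // ext > count -> degenerate case
  return 0;
}
```
Used as cta_threads_x in the branch above, this is exactly why blockDim.x comes out as 4 in the worked example.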
当 使用到了cta_count_z
即连续reduce的值分到了不同的 block上会使用 workspace
workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
The TensorReduction constructor therefore mainly fixes the grid and block parameters and, via workspace_count, whether a second cross-CTA reduction kernel has to be launched. When several grid.z slices work on inner_count, the workspace occupies outer_count * grid.z * sizeof(ElementCompute) bytes of global memory (worked numbers after workspace_size() below).
The reduce() member function (it can take a CUDA stream) also sets up the source/destination strides according to reduction_index:
case 3:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[1];
src_stride[2] = src_ref.stride()[0];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[1];
dst_stride[2] = dst_ref.stride()[0];
The strides are stored in reversed order.
The kernel Params constructor:
// Initialize divisors for fast div-mod
for (int p = 1; p < kRank; ++p) {
divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
}
for (int p = 0; p < kReducedRank; ++p) {
dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
}
for (int p = 0; p < kRank - 1; ++p) {
src_stride[p] = src_stride_[p] * input_size_bits / 8;
}
The strides stored in Params are converted from element units to bytes (element stride * sizeof_bits / 8); inside the kernel every stride and offset is in bytes, applied to a char pointer.
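A tiny illustration of the conversion (my own sketch, assuming a 16-bit ElementSource such as half):
```
#include <cassert>
#include <cstdint>

int main() {
  int64_t src_stride_elems = 64;   // some stride expressed in elements
  int input_size_bits = 16;        // sizeof_bits<ElementSource>::value for a half-precision source
  int64_t src_stride_bytes = src_stride_elems * input_size_bits / 8;
  assert(src_stride_bytes == 128); // the kernel adds this to a char pointer
  return 0;
}
```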
uint64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
This is the outer index, i.e. which output element this thread handles (grid.y was sized from outer_count).
Observed launch for the running example: grid(1, 24, 1), block(4, 1, 1).
Fast division:
Coord<kReducedRank> & coord,
coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
This decomposes linear_idx into a multi-dimensional coordinate; FastDivmodU64 is worth studying on its own.
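The idea behind the decomposition, written with plain '/' and '%' (my own sketch; FastDivmodU64 computes the same quotient/remainder with a precomputed multiply-and-shift instead of a hardware divide). With the divisors built above, divmod[p-1] holds extent[p]:
```
#include <cassert>
#include <cstdint>

int main() {
  int extent[3] = {2, 4, 3};      // an assumed reduced extent, slowest rank first
  uint64_t linear_idx = 17;       // 17 = (1*4 + 1)*3 + 2

  int coord[3];
  uint64_t rest = linear_idx;
  for (int i = 2; i > 0; --i) {   // peel off the fastest-varying ranks first
    coord[i] = int(rest % extent[i]);
    rest /= extent[i];
  }
  coord[0] = int(rest);

  assert(coord[0] == 1 && coord[1] == 1 && coord[2] == 2);
  return 0;
}
```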
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < kReducedRank; ++i) {
dst_offset += params.dst_stride[i] * coord[i];
src_offset += params.src_stride[i] * coord[i];
}
Why does this look column-major, given that the strides were stored reversed? The mapping is not obvious at first; the likely reason is that the layout's stride() is ordered fastest-rank-first while extent/coord are ordered slowest-rank-first, so reversing the strides lines the two up.
`ComputeFragment` has length kVectorLength; kBatchSize is 4.
for (int b = 0; b < kBatchSize; ++b) {
if (linear_idx < params.inner_count) {
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
guards[b] = true;
}
else {
guards[b] = false;
not_done = false;
}
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
}
The loop keeps loading values; the index that advances is linear_idx. Once it runs past inner_count the guard is cleared and not_done is set to false, i.e. that slot is finished.
This guard/batch pattern is a neat trick that replaces a plain element-by-element loop.
The core reduction loop:
CUTLASS_PRAGMA_UNROLL
for (int b = 0; b < kBatchSize; ++b) {
if (linear_idx < params.inner_count) {
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
guards[b] = true;
}
else {
guards[b] = false;
not_done = false;
}
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
}
// Perform a batch of reduction operations
CUTLASS_PRAGMA_UNROLL
for (int b = 0; b < kBatchSize; ++b) {
if (guards[b]) {
auto cvt = convert_source(source_fragment[b]);
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
reduction_op,
accumulator,
cvt);
} // In the common case (kVectorLength == 1) accumulator is Array<ElementCompute, 1>, so ApplyArrayOperator is effectively a scalar reduction
}
};
using ComputeFragment = Array<ElementCompute, VectorLength>;
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
Compared with my own version, this compute loop is unrolled by kBatchSize; the gain comes from the unrolling. My equivalent would look like:
for (auto id = blockIdx.x * blockDim.x + threadIdx.x; id < num_cols; id += blockDim.x * gridDim.x * batch)
{
  #pragma unroll
  for (int bi = 0; bi < batch; ++bi)
  {
    auto src_offset = id + bi * blockDim.x * gridDim.x;
    if (src_offset < num_cols)
    {
      accumulator = reduceOp(accumulator, src_ptr[src_offset]);
    }
  }
}
idx_linear is the outer loop (over output elements) and linear_idx is the inner loop (over the reduced elements). It feels like too few threads are allocated; the result is correct, but utilization is low.
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
blockDim.x here is not even a full warp.
Each thread then writes its partial value to shared memory and the block reduces there:
// ...
CUTLASS_PRAGMA_NO_UNROLL
while (thread_count > 1) {
thread_count /= 2;
__syncthreads();
if (thread_j < thread_count) {
ElementCompute other = frag_ptr[thread_j + thread_count];
reduced_accumulator = reduction_op(reduced_accumulator, other);
frag_ptr[thread_j] = reduced_accumulator;
}
__syncthreads();
}
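A host-side sketch of that halving loop (my own, assuming a plus reduction over 8 partials): each round folds the upper half onto the lower half until slot 0 holds the block result.
```
#include <cassert>

int main() {
  float smem[8] = {1, 2, 3, 4, 5, 6, 7, 8};   // one partial value per "thread"
  int thread_count = 8;
  while (thread_count > 1) {
    thread_count /= 2;
    for (int thread_j = 0; thread_j < thread_count; ++thread_j) {  // stands in for the synchronized threads
      smem[thread_j] = smem[thread_j] + smem[thread_j + thread_count];
    }
  }
  assert(smem[0] == 36.0f);
  return 0;
}
```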
workspace_size / workspace_stride:
int64_t workspace_size() const {
// Error condition
if (!good()) {
return 0;
}
// No reduction across CTAs
if (grid_shape.z == 1) {
return 0;
}
return workspace_stride() * grid_shape.z;
}
/// Size (in bytes) of <outer_count> workspace elements which are densely packed together
int64_t workspace_stride() const {
// Error condition
if (!good()) {
return 0;
}
return outer_count * sizeof_bits<ElementCompute>::value / 8;
}
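Worked numbers (my own sketch): with ElementCompute = float, outer_count = 24 and grid_shape.z = 2, the workspace comes out to 192 bytes.
```
#include <cassert>
#include <cstdint>

int main() {
  int64_t outer_count = 24;
  int grid_z = 2;
  int compute_size_bits = 32;                                       // sizeof_bits<float>::value
  int64_t workspace_stride = outer_count * compute_size_bits / 8;   // 96 bytes per z slice
  int64_t workspace_size   = workspace_stride * grid_z;             // 192 bytes total
  assert(workspace_stride == 96 && workspace_size == 192);
  return 0;
}
```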
At least this contiguous reduce does not feel like anything special...
The strided reduce path:
int cta_width = kThreads * kVectorLength;
int cta_ways = reshape_pow2(extent_c, cta_width);
int cta_threads_x = kThreads / cta_ways;
threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
This part looks much more sensible.
using ReductionDeviceStridedOperator = TensorReductionAffineStrided<
4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
>;
kReducedRank is fixed at 3, i.e. the reduced (strided) dimension always sits at index 2 of the permuted extent.
switch (reduction_index) {
case 0:
extent_affine[0] = extent[1];
extent_affine[1] = extent[2];
extent_affine[2] = extent[0];
extent_affine[3] = extent[3];
break;
case 1:
extent_affine[0] = extent[0];
extent_affine[1] = extent[2];
extent_affine[2] = extent[1];
extent_affine[3] = extent[3];
break;
case 2:
extent_affine[0] = extent[0];
extent_affine[1] = extent[1];
extent_affine[2] = extent[2];
extent_affine[3] = extent[3];
break;
This differs from the contiguous reduce: only cases 0-2 go down the strided path (reduction_index == 3 was already handed to the contiguous operator above).
// Compute number of elements in strided ranks
for (int p = 0; p < kReducedRank - 1; ++p) {
outer_count *= extent[p];
}
for (int p = 0; p < kInnerRank; ++p) {
inner_count *= extent[kReducedRank + p - 1];
}
Here outer_count only multiplies the ranks up to kReducedRank - 1, and inner_count does not include the last (contiguous) dimension.
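Worked example (my own sketch) with kRank = 4, kReducedRank = 3, kInnerRank = 1 and the permuted extent {2, 4, 3, 64} from earlier:
```
#include <cassert>
#include <cstdint>

int main() {
  int const kReducedRank = 3, kInnerRank = 1;
  int extent[4] = {2, 4, 3, 64};   // reduced (strided) dimension sits at index 2

  int64_t outer_count = 1, inner_count = 1;
  for (int p = 0; p < kReducedRank - 1; ++p) outer_count *= extent[p];                 // 2 * 4
  for (int p = 0; p < kInnerRank; ++p) inner_count *= extent[kReducedRank + p - 1];    // extent[2]

  assert(outer_count == 8);
  assert(inner_count == 3);        // the trailing contiguous dim (64) is not included
  return 0;
}
```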
int cta_width = kThreads * kVectorLength;
int cta_ways = reshape_pow2(extent_c, cta_width);
int cta_threads_x = kThreads / cta_ways;
threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
// This leads to an error.
if (threadblock_shape.z > 1) {
if (threadblock_shape.y != 1) {
status = Status::kErrorInternal;
return;
}
}
So blockDim.y = 1, while blockDim.x and blockDim.z are determined by the column (contiguous) extent; inner_count, i.e. the size of the reduced dimension, is handled along z.
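Concretely (my own sketch, assuming kThreads = 256, kVectorLength = 1 and a contiguous extent of 64):
```
#include <algorithm>
#include <cassert>

int reshape_pow2(int ext, int count) {   // same helper as before
  if (ext > count) return 1;
  int x = 1;
  for (; count >= ext * 2; count >>= 1) x <<= 1;
  return x;
}

int main() {
  int const kThreads = 256, kVectorLength = 1;   // assumed defaults
  int extent_c = 64;                             // contiguous (last) extent

  int cta_width = kThreads * kVectorLength;            // 256
  int cta_ways = reshape_pow2(extent_c, cta_width);    // 4
  int cta_threads_x = kThreads / cta_ways;             // 64

  assert(cta_ways == 4 && cta_threads_x == 64);
  assert(std::min(cta_ways, 64) == 4);                 // threadblock_shape = (64, 1, 4)
  return 0;
}
```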
The strides inside Params are laid out differently as well:
switch (reduction_index) {
case 0:
src_stride[0] = src_ref.stride()[1];
src_stride[1] = src_ref.stride()[0];
src_stride[2] = src_ref.stride()[2];
dst_stride[0] = dst_ref.stride()[1];
dst_stride[1] = dst_ref.stride()[0];
break;
case 1:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[0];
src_stride[2] = src_ref.stride()[1];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[0];
break;
case 2:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[1];
src_stride[2] = src_ref.stride()[0];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[1];
break;
case 3:
src_stride[0] = src_ref.stride()[2];
src_stride[1] = src_ref.stride()[1];
src_stride[2] = src_ref.stride()[0];
dst_stride[0] = dst_ref.stride()[2];
dst_stride[1] = dst_ref.stride()[1];
dst_stride[2] = dst_ref.stride()[0];
// Only threadIdx.z == 0 produces the final value; it folds in the partials from the other z slices:
// Linearized thread ID
int thread_idx = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
// all threads store to workspace
ComputeFragment *frag_ptr = reinterpret_cast<ComputeFragment *>(threadblock_workspace);
frag_ptr[thread_idx] = accumulator;
__syncthreads();
if (threadIdx.z == 0) {
// Load all additional block indices
for (int z = 1; z < blockDim.z; ++z) {
ComputeFragment frag = frag_ptr[thread_idx + z * blockDim.x * blockDim.y];
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
reduction_op,
accumulator,
frag);
}
}
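Indexing sketch for that final combine (my own, assuming blockDim = (64, 1, 4)): each z == 0 thread folds in the partials written by the same (x, y) lane of the other z slices.
```
#include <cassert>

int main() {
  int bx = 64, by = 1;            // assumed blockDim.x, blockDim.y
  int thread_idx = 5;             // thread (x = 5, y = 0, z = 0)

  int slots[3];
  for (int z = 1; z < 4; ++z) {
    slots[z - 1] = thread_idx + z * bx * by;   // frag_ptr index written by the z-th slice
  }
  assert(slots[0] == 69 && slots[1] == 133 && slots[2] == 197);
  return 0;
}
```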