32*32宏块的RGBA2GRAY转换过程
1)
// Top-level colour conversion entry point (excerpt: the `……` lines stand for switch cases
// omitted by the note author, and the function's closing brace is not part of this listing).
//
// _src : input array; must not be empty (asserted below).
// _dst : output array.
// code : colour conversion code that selects the switch case.
// dcn  : number of destination channels; when <= 0 it is derived from `code`.
//
// NOTE(review): the `= 10` / `= 0` defaults look like the argument values observed in the
// traced call rather than real default arguments — confirm. In OpenCV's
// ColorConversionCodes, 10 appears to be COLOR_BGRA2GRAY while COLOR_RGBA2GRAY is 11,
// which would not match the case shown below — confirm which code was actually traced.
void cvtColor( InputArray _src, OutputArray _dst, int code = 10, int dcn = 0 )
{
// Instrumentation hook; per the note author it is OpenCV's performance-tracking framework
// that records how OpenCV functions execute.
CV_INSTRUMENT_REGION();
CV_Assert(!_src.empty());
// No explicit channel count requested: derive it from the conversion code.
if(dcn <= 0)
dcn = dstChannels(code);
// Try the OpenCL implementation first. The condition requires a source of at most
// 2 dimensions and a UMat destination, and excludes 8-bit Luv2BGR / Luv2RGB.
CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() &&
!(CV_MAT_DEPTH(_src.type()) == CV_8U && (code == COLOR_Luv2BGR || code == COLOR_Luv2RGB)),
ocl_cvtColor(_src, _dst, code, dcn) )
// Reached when the OpenCL path is not taken: dispatch on the conversion code.
switch( code ){
……
// RGB/RGBA -> gray: the blue-swap flag passed on is computed from `code` by swapBlue().
case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
cvtColorBGR2Gray(_src, _dst, swapBlue(code));
break;
……
}
ref : CV_INSTRUMENT_REGION
ref : CV_OCL_RUN
ref : umat
2)
// Converts a 3- or 4-channel image to single-channel gray.
//
// _src  : input array; validated by CvtHelper (3 or 4 channels, depth CV_8U / CV_16U / CV_32F).
// _dst  : output array; created by CvtHelper with 1 channel and the source depth.
// swapb : forwarded unchanged to hal::cvtBGRtoGray as its swap-blue flag.
void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb)
{
    // Allowed source channels / destination channels / depths for this conversion.
    typedef CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > GrayHelper;

    // The helper checks the input and allocates the 1-channel destination.
    GrayHelper h(_src, _dst, 1);

    hal::cvtBGRtoGray(h.src.data, h.src.step,
                      h.dst.data, h.dst.step,
                      h.src.cols, h.src.rows,
                      h.depth, h.scn, swapb);
}
这里的set定义如下:
// Compile-time set of up to three int values, queried with contains().
// Unused slots default to -1; the partial specializations below drop those slots so that
// a query for -1 does not accidentally match an unused slot.
template<int i0, int i1 = -1, int i2 = -1>
struct Set
{
    // True when `i` equals any of the three template values.
    static inline bool contains(int i)
    {
        if (i == i0)
            return true;
        if (i == i1)
            return true;
        return i == i2;
    }
};

// Two-value set (third slot unused).
template<int i0, int i1>
struct Set<i0, i1, -1>
{
    // True when `i` equals either template value.
    static inline bool contains(int i)
    {
        if (i == i0)
            return true;
        return i == i1;
    }
};

// Single-value set (second and third slots unused).
template<int i0>
struct Set<i0, -1, -1>
{
    // True when `i` equals the single template value.
    static inline bool contains(int i)
    {
        const bool matches = (i == i0);
        return matches;
    }
};
ref : 非类型模板参数
可以看出这里的Set只用于做成员判断(contains),具体用法可以在CvtHelper的构造函数中看到:
// Validates the source against the helper's allowed channel/depth sets, obtains the
// source Mat (copying it first when source and destination are the same object), and
// creates the destination with `dcn` channels and the source depth.
//
// _src : input array; must not be empty.
// _dst : output array; (re)created here with the size chosen by sizePolicy.
// dcn  : requested destination channel count; must be a member of VDcn.
CvtHelper(InputArray _src, OutputArray _dst, int dcn)
{
    CV_Assert(!_src.empty());

    const int stype = _src.type();
    scn = CV_MAT_CN(stype);
    depth = CV_MAT_DEPTH(stype);

    // VScn / VDcn / VDepth are the Set<...> template arguments, e.g.
    // CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);
    CV_Check(scn, VScn::contains(scn), "Invalid number of channels in input image");
    CV_Check(dcn, VDcn::contains(dcn), "Invalid number of channels in output image");
    CV_CheckDepth(depth, VDepth::contains(depth), "Unsupported depth of input image");

    // inplace processing (#6653): work on a copy when input and output are the same object.
    const bool sameObject = (_src.getObj() == _dst.getObj());
    if (!sameObject)
        src = _src.getMat();
    else
        _src.copyTo(src);

    const Size sz = src.size();
    switch (sizePolicy)
    {
    case FROM_YUV:
        // Destination height is 2/3 of the source height.
        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0);
        dstSz = Size(sz.width, sz.height * 2 / 3);
        break;
    case TO_YUV:
        // Destination height is 3/2 of the source height.
        CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0);
        dstSz = Size(sz.width, sz.height / 2 * 3);
        break;
    case NONE:
    default:
        // Same size as the source.
        dstSz = sz;
        break;
    }

    _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
    dst = _dst.getMat();
}
这里会构造出结构如下的CvtHelper:
struct CvtHelper
{
Mat src ;{由传入参数 InputArray _src 转换得的,step = 128 [一行的字节数:32像素 * 4通道 * 1字节]}
Mat dst; {由传入参数 OutputArray _dst 转换得的,step = 32 [一行的字节数:32像素 * 1通道 * 1字节;前提是dst由_dst.create新分配、内存连续]}
int depth; {0}
int scn;{4}
Size dstSz;{32*32}
};
3)
// 8u, 16u, 32f
// HAL-level entry point for reordering / adding / dropping colour channels.
//
// NOTE(review): cvtColorBGR2Gray in this walkthrough calls hal::cvtBGRtoGray, not this
// function; this listing is the cvtBGRtoBGR sibling — confirm which one was meant.
//
// src_data, src_step : source buffer and its row stride.
// dst_data, dst_step : destination buffer and its row stride.
// width, height      : image size.
// depth              : element depth; used to index the IPP function tables.
// scn, dcn           : source / destination channel counts (3 or 4 in the IPP branches).
// swapBlue           : whether the first and third channels are exchanged.
//
// Dispatch order:
//   1. CALL_HAL with cv_hal_cvtBGRtoBGR (per the note author, the function ends here
//      when the HAL implementation handles the call).
//   2. When built with IPP >= 7.0: an IPP routine chosen by (scn, dcn, swapBlue);
//      returns as soon as one succeeds.
//   3. CV_CPU_DISPATCH to the instruction-set specific implementation.
//
// Fix: the brace closing the CV_IPP_CHECK() block used to sit inside the nested
// `#if IPP_VERSION_X100 >= 810`, which left the block unclosed for
// 700 <= IPP_VERSION_X100 < 810. It now follows the inner #endif.
void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int scn, int dcn, bool swapBlue)
{
    CV_INSTRUMENT_REGION();

    // Give a platform HAL implementation the first chance to handle the conversion.
    CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        if(scn == 3 && dcn == 4 && !swapBlue)
        {
            // 3 -> 4 channels, same channel order.
            if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                 IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
                return;
        }
        else if(scn == 4 && dcn == 3 && !swapBlue)
        {
            // 4 -> 3 channels, same channel order.
            if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                 IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
                return;
        }
        else if(scn == 3 && dcn == 4 && swapBlue)
        {
            // 3 -> 4 channels, first and third channels exchanged.
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
                return;
        }
        else if(scn == 4 && dcn == 3 && swapBlue)
        {
            // 4 -> 3 channels, first and third channels exchanged.
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
                return;
        }
        else if(scn == 3 && dcn == 3 && swapBlue)
        {
            // 3 -> 3 channels, first and third channels exchanged.
            if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                    IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
                return;
        }
#if IPP_VERSION_X100 >= 810
        else if(scn == 4 && dcn == 4 && swapBlue)
        {
            // 4 -> 4 channels, first and third channels exchanged (IPP >= 8.1 only).
            if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                    IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
                return;
        }
#endif
    }
#endif

    // Neither HAL nor IPP handled it: pick the implementation for the available CPU instruction set.
    CV_CPU_DISPATCH(cvtBGRtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue),
        CV_CPU_DISPATCH_MODES_ALL);
}
CV_CPU_DISPATCH可分派的CPU特性(指令集)编号定义如下:
/* CPU features and intrinsics support */
// Integer identifiers for individual CPU features. The x86 family uses 0..27; the other
// identifiers (NEON, MSA, VSX, VSX3, RVV) start at 100.
// CV_CPU_AVX_512IFMA512 and CV_CPU_AVX_512IFMA deliberately share the value 18; the
// former is kept as a deprecated alias.
#define CV_CPU_NONE 0
#define CV_CPU_MMX 1
#define CV_CPU_SSE 2
#define CV_CPU_SSE2 3
#define CV_CPU_SSE3 4
#define CV_CPU_SSSE3 5
#define CV_CPU_SSE4_1 6
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_FP16 9
#define CV_CPU_AVX 10
#define CV_CPU_AVX2 11
#define CV_CPU_FMA3 12
#define CV_CPU_AVX_512F 13
#define CV_CPU_AVX_512BW 14
#define CV_CPU_AVX_512CD 15
#define CV_CPU_AVX_512DQ 16
#define CV_CPU_AVX_512ER 17
#define CV_CPU_AVX_512IFMA512 18 // deprecated
#define CV_CPU_AVX_512IFMA 18
#define CV_CPU_AVX_512PF 19
#define CV_CPU_AVX_512VBMI 20
#define CV_CPU_AVX_512VL 21
#define CV_CPU_AVX_512VBMI2 22
#define CV_CPU_AVX_512VNNI 23
#define CV_CPU_AVX_512BITALG 24
#define CV_CPU_AVX_512VPOPCNTDQ 25
#define CV_CPU_AVX_5124VNNIW 26
#define CV_CPU_AVX_5124FMAPS 27
#define CV_CPU_NEON 100
#define CV_CPU_MSA 150
#define CV_CPU_VSX 200
#define CV_CPU_VSX3 201
#define CV_CPU_RVV 210
// CPU features groups
// Identifiers from 256 upwards name groups of AVX-512 features rather than single features.
#define CV_CPU_AVX512_SKX 256
#define CV_CPU_AVX512_COMMON 257
#define CV_CPU_AVX512_KNL 258
#define CV_CPU_AVX512_KNM 259
#define CV_CPU_AVX512_CNL 260
#define CV_CPU_AVX512_CLX 261
#define CV_CPU_AVX512_ICL 262
// when adding to this list remember to update the following enum
// NOTE(review): presumably the exclusive upper bound for feature identifiers — confirm.
#define CV_HARDWARE_MAX_FEATURE 512
4)
// CPU implementation of the channel reorder: selects the element type from `depth` and
// runs an RGB2RGB functor over every row through CvtColorLoop.
//
// src_data, src_step : source buffer and its row stride.
// dst_data, dst_step : destination buffer and its row stride.
// width, height      : image size.
// depth              : CV_8U -> uchar, CV_16U -> ushort, anything else -> float.
// scn, dcn           : source / destination channel counts passed to RGB2RGB.
// swapBlue           : selects blue index 2 (true) or 0 (false).
void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int scn, int dcn, bool swapBlue)
{
    CV_INSTRUMENT_REGION();

    // Position of the blue channel handed to the functor.
    int blueIdx = 0;
    if (swapBlue)
        blueIdx = 2;

    switch (depth)
    {
    case CV_8U:
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<uchar>(scn, dcn, blueIdx));
        break;
    case CV_16U:
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<ushort>(scn, dcn, blueIdx));
        break;
    default:
        // Every remaining depth is handled as float.
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<float>(scn, dcn, blueIdx));
        break;
    }
}
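注意:第2)步实际调用的是hal::cvtBGRtoGray,而上面3)、4)两步贴出的是cvtBGRtoBGR的代码;cvtBGRtoGray的分派流程应当与之类似,只是传给CvtColorLoop的仿函数换成了RGB2Gray(请对照源码确认)。结构体RGB2Gray是颜色转换函数的一层封装,其构造函数如下: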
// Stores the source channel count and the three fixed-point gray weights.
//
// _srccn  : number of channels per source pixel.
// blueIdx : when 0, the first and third weights are exchanged so that coeffs[0]
//           applies to channel 0.
// _coeffs : optional array of three custom weights; when null the defaults
//           { RY, GY, BY } are used.
//
// The weights must add up to exactly 1 << shift (asserted).
RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
{
    const int coeffs0[] = { RY, GY, BY };

    // Use the caller's weights when given, otherwise the defaults.
    const int* const chosen = _coeffs ? _coeffs : coeffs0;
    for (int k = 0; k < 3; ++k)
        coeffs[k] = static_cast<short>(chosen[k]);

    if (blueIdx == 0)
        std::swap(coeffs[0], coeffs[2]);

    CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
}
其内部实现了灰度的转换:
// Converts `n` interleaved pixels at `src` (srccn channels each) into `n` 8-bit gray
// values at `dst`. Channels 0 / 1 / 2 are weighted by coeffs[0] / [1] / [2] in fixed
// point and scaled back down by `shift`.
void operator()(const uchar* src, uchar* dst, int n) const
{
int scn = srccn;
short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
int i = 0;
#if CV_SIMD
// Vector path: handles v_uint8::nlanes pixels per iteration; whatever is left over is
// processed by the scalar loop after the #endif.
const int vsize = v_uint8::nlanes;
v_int16 bg2y;
v_int16 r12y;
v_int16 dummy;
// Interleave the weights as (cb, cg, cb, cg, ...) and (cr, 1, cr, 1, ...) so they line
// up with the zipped pixel data built inside the loop.
v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
// Rounding term (half of 1 << shift); it is paired with the constant 1 in r12y.
v_int16 delta = vx_setall_s16(1 << (shift-1));
for( ; i <= n-vsize;
i += vsize, src += scn*vsize, dst += vsize)
{
v_uint8 r, g, b, a;
// Split the interleaved pixels into one vector per channel. With 4 channels the
// fourth one is loaded into `a` but not used afterwards.
if(scn == 3)
{
v_load_deinterleave(src, b, g, r);
}
else
{
v_load_deinterleave(src, b, g, r, a);
}
//TODO: shorten registers use when v_deinterleave is available
// Widen each 8-bit channel vector into two 16-bit halves.
v_uint16 r0, r1, g0, g1, b0, b1;
v_expand(r, r0, r1);
v_expand(g, g0, g1);
v_expand(b, b0, b1);
// Pair up (channel 0, channel 1) and (channel 2, delta) lanes to match bg2y / r12y.
v_int16 bg00, bg01, bg10, bg11;
v_int16 rd00, rd01, rd10, rd11;
v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);
// NOTE(review): assuming v_dotprod multiplies adjacent 16-bit pairs and adds each pair,
// every 32-bit lane holds b*cb + g*cg + r*cr + delta before the right shift by `shift`
// — confirm against the universal intrinsics documentation.
v_uint32 y00, y01, y10, y11;
y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;
// Narrow 32-bit -> 16-bit -> 8-bit and store vsize gray values.
v_uint16 y0, y1;
y0 = v_pack(y00, y01);
y1 = v_pack(y10, y11);
v_uint8 y = v_pack(y0, y1);
v_store(dst, y);
}
vx_cleanup();
#endif
// Scalar path: every pixel when CV_SIMD is disabled, otherwise the tail left over by
// the vector loop (`i`, `src` and `dst` continue from where it stopped).
for( ; i < n; i++, src += scn, dst++)
{
int b = src[0], g = src[1], r = src[2];
uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
dst[0] = y;
}
}
int srccn;
short coeffs[3];
};
5)
void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
CV_AVX_GUARD
//第三个参数:step执行次数 = 线程数 * 每个线程执行的step次数
parallel_for_(Range(0, height),
CvtColorLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt),
(width * height) / static_cast<double>(1<<16));
}
CvtColorLoop_Invoker用于封装对RGB2Gray的调用
// Runs the wrapped converter on every row in [range.start, range.end).
// Each call to cvt converts one row of `width` elements.
virtual void operator()(const Range& range) const CV_OVERRIDE
{
    CV_TRACE_FUNCTION();

    for (int row = range.start; row < range.end; ++row)
    {
        // Row start = base pointer + row index * row stride.
        const size_t rowIndex = static_cast<size_t>(row);
        const uchar* srcRow = src_data + rowIndex * src_step;
        uchar* dstRow = dst_data + rowIndex * dst_step;

        cvt(reinterpret_cast<const _Tp*>(srcRow), reinterpret_cast<_Tp*>(dstRow), width);
    }
}
6)
// Runs `body` over `range`, in parallel when a parallel framework is compiled in and this
// is not a nested call; otherwise runs body(range) directly on the calling thread.
//
// range    : index range to process; an empty range returns immediately.
// body     : loop body invoked with sub-ranges (or with the whole range when serial).
// nstripes : stripe-count hint forwarded to parallel_for_impl; unused on the serial path.
void parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes)
{
#ifdef OPENCV_TRACE
CV__TRACE_OPENCV_FUNCTION_NAME_("parallel_for", 0);
CV_TRACE_ARG_VALUE(range_start, "range.start", (int64)range.start);
CV_TRACE_ARG_VALUE(range_end, "range.end", (int64)range.end);
CV_TRACE_ARG_VALUE(nstripes, "nstripes", (int64)nstripes);
#endif
CV_INSTRUMENT_REGION_MT_FORK();
if (range.empty())
return;
#ifdef CV_PARALLEL_FRAMEWORK
// Flag shared by all calls: set while one parallel_for_ is running in parallel mode.
static std::atomic<bool> flagNestedParallelFor(false);
// Plain load first; only if the flag looks free, try to claim it with exchange(true).
// The claim succeeded when the previous value was false.
bool isNotNestedRegion = !flagNestedParallelFor.load();
if (isNotNestedRegion)
isNotNestedRegion = !flagNestedParallelFor.exchange(true);
if (isNotNestedRegion)
{
try
{
parallel_for_impl(range, body, nstripes);
flagNestedParallelFor = false;
}
catch (...)
{
// Release the flag before rethrowing so later calls can run in parallel again.
flagNestedParallelFor = false;
throw;
}
}
else // nested parallel_for_() calls are not parallelized
#endif // CV_PARALLEL_FRAMEWORK
{
// Serial path: also the only path when CV_PARALLEL_FRAMEWORK is not defined.
CV_UNUSED(nstripes);
body(range);
}
}
// Splits `range` into stripes and runs `body` on them with whichever parallel backend
// was selected at compile time. Falls back to a direct body(range) call when
// `numThreads` is 0 or 1, when the range holds at most one element, or when the
// computed stripe range contains exactly one stripe.
//
// range    : index range to process.
// body     : loop body to run.
// nstripes : stripe-count hint passed to ParallelLoopBodyWrapperContext.
static void parallel_for_impl(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes)
{
if ((numThreads < 0 || numThreads > 1) && range.end - range.start > 1)
{
// ctx wraps the body together with the range and stripe hint; pbody is the callable
// handed to the backend.
ParallelLoopBodyWrapperContext ctx(body, range, nstripes);
ProxyLoopBody pbody(ctx);
cv::Range stripeRange = pbody.stripeRange();
// A single stripe gains nothing from the backend: run the body inline.
if( stripeRange.end - stripeRange.start == 1 )
{
// In the cvtColor walkthrough, `body` is the CvtColorLoop_Invoker.
body(range);
return;
}
// Exactly one of the following backends is compiled in.
#if defined HAVE_TBB
#if TBB_INTERFACE_VERSION >= 8000
tbbArena.execute(pbody);
#else
pbody();
#endif
#elif defined HAVE_HPX
pbody();
#elif defined HAVE_OPENMP
// One loop iteration per stripe.
#pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
for (int i = stripeRange.start; i < stripeRange.end; ++i)
pbody(Range(i, i + 1));
#elif defined HAVE_GCD
dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);
#elif defined WINRT
Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
#elif defined HAVE_CONCURRENCY
// Use the current scheduler when no custom one is set or it is already the current one;
// otherwise attach the custom scheduler for the duration of the loop.
if(!pplScheduler || pplScheduler->Id() == Concurrency::CurrentScheduler::Id())
{
Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
}
else
{
pplScheduler->Attach();
Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
Concurrency::CurrentScheduler::Detach();
}
#elif defined HAVE_PTHREADS_PF
parallel_for_pthreads(pbody.stripeRange(), pbody, pbody.stripeRange().size());
#else
#error You have hacked and compiling with unsupported parallel framework
#endif
ctx.finalize(); // propagate exceptions if exists
}
else
{
// Threading disabled or nothing to split: run on the calling thread.
body(range);
}
}
// Captures everything the worker stripes need: the loop body, the full range, the
// number of stripes, and state copied from the calling thread.
//
// _body     : loop body; only its address is stored.
// _r        : whole range to be processed.
// _nstripes : requested stripe count; values <= 0 mean one stripe per element,
//             other values are clamped to [1, range length] before rounding.
ParallelLoopBodyWrapperContext(const cv::ParallelLoopBody& _body, const cv::Range& _r, double _nstripes) :
    is_rng_used(false), hasException(false)
{
    body = &_body;
    wholeRange = _r;

    // Number of stripes (not a step length): derived from the hint and the range length.
    const double len = wholeRange.end - wholeRange.start;
    double stripeCount;
    if (_nstripes <= 0)
        stripeCount = len;
    else
        stripeCount = MIN(MAX(_nstripes, 1.), len);
    nstripes = cvRound(stripeCount);

    // propagate main thread state
    // cv::theRNG() is described by the note author as returning the default random number generator.
    rng = cv::theRNG();

#ifdef OPENCV_TRACE
    traceRootRegion = CV_TRACE_NS::details::getCurrentRegion();
    traceRootContext = CV_TRACE_NS::details::getTraceManager().tls.get();
#endif

#ifdef ENABLE_INSTRUMENTATION
    pThreadRoot = cv::instr::getInstrumentTLSStruct().pCurrentNode;
#endif
}