32*32宏块的RGBA2GRAY转换过程

1）

void cvtColor( InputArray _src, OutputArray _dst, int code = 10, int dcn = 0 )
{
//是opencv性能检测框架，跟踪opencv的函数执行情况
    CV_INSTRUMENT_REGION();

    CV_Assert(!_src.empty());
//根据传入的转换方式，获取通道数量
    if(dcn <= 0)
            dcn = dstChannels(code);
//`CV_OCL_RUN`用于OpenCL代码
    CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() &&
                !(CV_MAT_DEPTH(_src.type()) == CV_8U && (code == COLOR_Luv2BGR || code == COLOR_Luv2RGB)),
                ocl_cvtColor(_src, _dst, code, dcn) )
//无法使用opencl时，进入switch
    switch( code ){
  ……     
            case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
            cvtColorBGR2Gray(_src, _dst, swapBlue(code));
            break;
……
}

ref : CV_INSTRUMENT_REGION
ref : CV_OCL_RUN
ref : umat

2）

void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb)
{
    CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);

    hal::cvtBGRtoGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                      h.depth, h.scn, swapb);
}

这里的set定义如下：

template<int i0, int i1 = -1, int i2 = -1>
struct Set
{
    static inline bool contains(int i)
    {
        return (i == i0 || i == i1 || i == i2);
    }
};

template<int i0, int i1>
struct Set<i0, i1, -1>
{
    static inline bool contains(int i)
    {
        return (i == i0 || i == i1);
    }
};
template<int i0>
struct Set<i0, -1, -1>
{
    static inline bool contains(int i)
    {
        return (i == i0);
    }
};

ref : 非类型模板参数
可以看出这里set只是为了比较，这点的使用，可以在CvtHelpter的构造函数中看到：

CvtHelper(InputArray _src, OutputArray _dst, int dcn)
    {
        CV_Assert(!_src.empty());

        int stype = _src.type();
        scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype);
//`set`用于比较 CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);
        CV_Check(scn, VScn::contains(scn), "Invalid number of channels in input image");
        CV_Check(dcn, VDcn::contains(dcn), "Invalid number of channels in output image");
        CV_CheckDepth(depth, VDepth::contains(depth), "Unsupported depth of input image");

        if (_src.getObj() == _dst.getObj()) // inplace processing (#6653)
            _src.copyTo(src);
        else
            src = _src.getMat();
        Size sz = src.size();
        switch (sizePolicy)
        {
        case TO_YUV:
            CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0);
            dstSz = Size(sz.width, sz.height / 2 * 3);
            break;
        case FROM_YUV:
            CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0);
            dstSz = Size(sz.width, sz.height * 2 / 3);
            break;
        case NONE:
        default:
            dstSz = sz;
            break;
        }
        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
        dst = _dst.getMat();
    }

这里会构造出结构如下的CvtHelper：

struct CvtHelper
{
    Mat src ;{由传入参数 InputArray _src 转换得的，step = 128 [一行的像素数量]}
    Mat dst; {由传入参数 OutputArray _dst 转换得的，step = 128[一行的像素数量]}
    int depth; {0}
    int scn;{4}
    Size dstSz;{32*32}
};

3）

// 8u, 16u, 32f
void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int scn, int dcn, bool swapBlue)
{
    CV_INSTRUMENT_REGION();
//对函数进行HAL优化。如果能够优化，函数就执行结束，否则须要执行后绪的根据指令集优化。
    CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
    if(scn == 3 && dcn == 4 && !swapBlue)
    {
        if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                             IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
            return;
    }
    else if(scn == 4 && dcn == 3 && !swapBlue)
    {
        if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                             IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
            return;
    }
    else if(scn == 3 && dcn == 4 && swapBlue)
    {
        if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                            IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
            return;
    }
    else if(scn == 4 && dcn == 3 && swapBlue)
    {
        if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                            IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
            return;
    }
    else if(scn == 3 && dcn == 3 && swapBlue)
    {
        if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
            return;
    }
#if IPP_VERSION_X100 >= 810
    else if(scn == 4 && dcn == 4 && swapBlue)
    {
        if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
            return;
    }
    }
#endif
#endif
//对函数进行依据cpu指令集优化
    CV_CPU_DISPATCH(cvtBGRtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue),
        CV_CPU_DISPATCH_MODES_ALL);
}

CPU_DISPATCH有如下选项：

/* CPU features and intrinsics support */
#define CV_CPU_NONE             0
#define CV_CPU_MMX              1
#define CV_CPU_SSE              2
#define CV_CPU_SSE2             3
#define CV_CPU_SSE3             4
#define CV_CPU_SSSE3            5
#define CV_CPU_SSE4_1           6
#define CV_CPU_SSE4_2           7
#define CV_CPU_POPCNT           8
#define CV_CPU_FP16             9
#define CV_CPU_AVX              10
#define CV_CPU_AVX2             11
#define CV_CPU_FMA3             12

#define CV_CPU_AVX_512F         13
#define CV_CPU_AVX_512BW        14
#define CV_CPU_AVX_512CD        15
#define CV_CPU_AVX_512DQ        16
#define CV_CPU_AVX_512ER        17
#define CV_CPU_AVX_512IFMA512   18 // deprecated
#define CV_CPU_AVX_512IFMA      18
#define CV_CPU_AVX_512PF        19
#define CV_CPU_AVX_512VBMI      20
#define CV_CPU_AVX_512VL        21
#define CV_CPU_AVX_512VBMI2     22
#define CV_CPU_AVX_512VNNI      23
#define CV_CPU_AVX_512BITALG    24
#define CV_CPU_AVX_512VPOPCNTDQ 25
#define CV_CPU_AVX_5124VNNIW    26
#define CV_CPU_AVX_5124FMAPS    27

#define CV_CPU_NEON             100

#define CV_CPU_MSA              150

#define CV_CPU_VSX              200
#define CV_CPU_VSX3             201

#define CV_CPU_RVV              210

// CPU features groups
#define CV_CPU_AVX512_SKX       256
#define CV_CPU_AVX512_COMMON    257
#define CV_CPU_AVX512_KNL       258
#define CV_CPU_AVX512_KNM       259
#define CV_CPU_AVX512_CNL       260
#define CV_CPU_AVX512_CLX       261
#define CV_CPU_AVX512_ICL       262

// when adding to this list remember to update the following enum
#define CV_HARDWARE_MAX_FEATURE 512

ref : cpu dispatch

4）

void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int scn, int dcn, bool swapBlue)
{
    CV_INSTRUMENT_REGION();

    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<uchar>(scn, dcn, blueIdx));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<ushort>(scn, dcn, blueIdx));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<float>(scn, dcn, blueIdx));
}

结构体RGB2Gray是颜色转换函数的一层封装，其构造函数如下：

    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
    {
        const int coeffs0[] = { RY, GY, BY };
        for(int i = 0; i < 3; i++)
                coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
    }

其内部实现了灰度的转换：

void operator()(const uchar* src, uchar* dst, int n) const
    {
        int scn = srccn;
        short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
        int i = 0;

#if CV_SIMD
        const int vsize = v_uint8::nlanes;
        v_int16 bg2y;
        v_int16 r12y;
        v_int16 dummy;
        v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
        v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
        v_int16 delta = vx_setall_s16(1 << (shift-1));

        for( ; i <= n-vsize;
             i += vsize, src += scn*vsize, dst += vsize)
        {
            v_uint8 r, g, b, a;
            if(scn == 3)
            {
                v_load_deinterleave(src, b, g, r);
            }
            else
            {
                v_load_deinterleave(src, b, g, r, a);
            }

            //TODO: shorten registers use when v_deinterleave is available

            v_uint16 r0, r1, g0, g1, b0, b1;
            v_expand(r, r0, r1);
            v_expand(g, g0, g1);
            v_expand(b, b0, b1);

            v_int16 bg00, bg01, bg10, bg11;
            v_int16 rd00, rd01, rd10, rd11;
            v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
            v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
            v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
            v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);

            v_uint32 y00, y01, y10, y11;
            y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
            y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
            y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
            y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;

            v_uint16 y0, y1;
            y0 = v_pack(y00, y01);
            y1 = v_pack(y10, y11);

            v_uint8 y = v_pack(y0, y1);
            v_store(dst, y);
        }
        vx_cleanup();
#endif

        for( ; i < n; i++, src += scn, dst++)
        {
            int b = src[0], g = src[1], r = src[2];
            uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
            dst[0] = y;
        }
    }

    int srccn;
    short coeffs[3];
};

5）

void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
    CV_AVX_GUARD
//第三个参数：step执行次数 = 线程数 * 每个线程执行的step次数
    parallel_for_(Range(0, height),
                  CvtColorLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt),
                  (width * height) / static_cast<double>(1<<16));
}

CvtColorLoop_Invoker用于封装对RGB2Gray的调用

 virtual void operator()(const Range& range) const CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();

        const uchar* yS = src_data + static_cast<size_t>(range.start) * src_step;
        uchar* yD = dst_data + static_cast<size_t>(range.start) * dst_step;
//每次转化128个像素
        for( int i = range.start; i < range.end; ++i, yS += src_step, yD += dst_step )
            cvt(reinterpret_cast<const _Tp*>(yS), reinterpret_cast<_Tp*>(yD), width);
    }

6）

void parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes)
{
#ifdef OPENCV_TRACE
    CV__TRACE_OPENCV_FUNCTION_NAME_("parallel_for", 0);
    CV_TRACE_ARG_VALUE(range_start, "range.start", (int64)range.start);
    CV_TRACE_ARG_VALUE(range_end, "range.end", (int64)range.end);
    CV_TRACE_ARG_VALUE(nstripes, "nstripes", (int64)nstripes);
#endif

    CV_INSTRUMENT_REGION_MT_FORK();
    if (range.empty())
        return;

#ifdef CV_PARALLEL_FRAMEWORK
    static std::atomic<bool> flagNestedParallelFor(false);
    bool isNotNestedRegion = !flagNestedParallelFor.load();
    if (isNotNestedRegion)
      isNotNestedRegion = !flagNestedParallelFor.exchange(true);
    if (isNotNestedRegion)
    {
        try
        {
            parallel_for_impl(range, body, nstripes);
            flagNestedParallelFor = false;
        }
        catch (...)
        {
            flagNestedParallelFor = false;
            throw;
        }
    }
    else // nested parallel_for_() calls are not parallelized
#endif // CV_PARALLEL_FRAMEWORK
    {
        CV_UNUSED(nstripes);
        body(range);
    }
}

static void parallel_for_impl(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes)
{
    if ((numThreads < 0 || numThreads > 1) && range.end - range.start > 1)
    {
        ParallelLoopBodyWrapperContext ctx(body, range, nstripes);
        ProxyLoopBody pbody(ctx);
        cv::Range stripeRange = pbody.stripeRange();
        if( stripeRange.end - stripeRange.start == 1 )
        {
            //即CvtColorLoop_Invoker
            body(range);
            return;
        }

#if defined HAVE_TBB

#if TBB_INTERFACE_VERSION >= 8000
        tbbArena.execute(pbody);
#else
        pbody();
#endif

#elif defined HAVE_HPX
        pbody();

#elif defined HAVE_OPENMP

        #pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
        for (int i = stripeRange.start; i < stripeRange.end; ++i)
            pbody(Range(i, i + 1));

#elif defined HAVE_GCD

        dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
        dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);

#elif defined WINRT

        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);

#elif defined HAVE_CONCURRENCY

        if(!pplScheduler || pplScheduler->Id() == Concurrency::CurrentScheduler::Id())
        {
            Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
        }
        else
        {
            pplScheduler->Attach();
            Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
            Concurrency::CurrentScheduler::Detach();
        }

#elif defined HAVE_PTHREADS_PF

        parallel_for_pthreads(pbody.stripeRange(), pbody, pbody.stripeRange().size());

#else

#error You have hacked and compiling with unsupported parallel framework

#endif

        ctx.finalize();  // propagate exceptions if exists
    }
    else
    {
        body(range);
    }
}

ParallelLoopBodyWrapperContext(const cv::ParallelLoopBody& _body, const cv::Range& _r, double _nstripes) :
            is_rng_used(false), hasException(false)
{

            body = &_body;
            wholeRange = _r;
            double len = wholeRange.end - wholeRange.start;
//这里使用_nstripes计算步长
            nstripes = cvRound(_nstripes <= 0 ? len : MIN(MAX(_nstripes, 1.), len));

            // propagate main thread state
            // cv::theRNG返回默认随机数生成器
            rng = cv::theRNG();

#ifdef OPENCV_TRACE
            traceRootRegion = CV_TRACE_NS::details::getCurrentRegion();
            traceRootContext = CV_TRACE_NS::details::getTraceManager().tls.get();
#endif

#ifdef ENABLE_INSTRUMENTATION
            pThreadRoot = cv::instr::getInstrumentTLSStruct().pCurrentNode;
#endif
}

RGBA2GRAY 内部实现

RGBA2GRAY 内部实现

1）

2）

3）

4）

5）

6）

推荐阅读更多精彩内容