Merge pull request #10553 from GlueCrow:bgfg_knn_opencl

Add ocl version BackgroundSubtractorKNN (#10553) * Add ocl version bgfg_knn * Add ocl KNN perf test * ocl KNN: Avoid unnecessary initializing when non-UMat parameters are used * video: turn off OpenCL for color KNN on Intel devices due performance degradation * video: turn off KNN OpenCL on Apple devices with Intel iGPU due process freeze during clBuildProgram() call
7 years ago · c8d8b1fbcd
parent 92fb3fb33f
commit c8d8b1fbcd
3 changed files with 583 additions and 37 deletions
--- a/modules/video/perf/opencl/perf_bgfg_knn.cpp
+++ b/modules/video/perf/opencl/perf_bgfg_knn.cpp
@ -0,0 +1,95 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"
 #ifdef HAVE_OPENCL
 #ifdef HAVE_VIDEO_INPUT
 #include "../perf_bgfg_utils.hpp"
 namespace cvtest {
 namespace ocl {
 //////////////////////////// KNN//////////////////////////
 typedef tuple<string, int> VideoKNNParamType;
 typedef TestBaseWithParam<VideoKNNParamType> KNN_Apply;
 typedef TestBaseWithParam<VideoKNNParamType> KNN_GetBackgroundImage;
 using namespace opencv_test;
 OCL_PERF_TEST_P(KNN_Apply, KNN, Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), Values(1,3)))
 {
    VideoKNNParamType params = GetParam();
    const string inputFile = getDataPath(get<0>(params));
    const int cn = get<1>(params);
    int nFrame = 5;
    vector<Mat> frame_buffer(nFrame);
    cv::VideoCapture cap(inputFile);
    ASSERT_TRUE(cap.isOpened());
    prepareData(cap, cn, frame_buffer);
    UMat u_foreground;
    OCL_TEST_CYCLE()
    {
        Ptr<cv::BackgroundSubtractorKNN> knn = createBackgroundSubtractorKNN();
        knn->setDetectShadows(false);
        u_foreground.release();
        for (int i = 0; i < nFrame; i++)
        {
            knn->apply(frame_buffer[i], u_foreground);
        }
    }
    SANITY_CHECK_NOTHING();
 }
 OCL_PERF_TEST_P(KNN_GetBackgroundImage, KNN, Values(
        std::make_pair<string, int>("gpu/video/768x576.avi", 5),
        std::make_pair<string, int>("gpu/video/1920x1080.avi", 5)))
 {
    VideoKNNParamType params = GetParam();
    const string inputFile = getDataPath(get<0>(params));
    const int cn = 3;
    const int skipFrames = get<1>(params);
    int nFrame = 10;
    vector<Mat> frame_buffer(nFrame);
    cv::VideoCapture cap(inputFile);
    ASSERT_TRUE(cap.isOpened());
    prepareData(cap, cn, frame_buffer, skipFrames);
    UMat u_foreground, u_background;
    OCL_TEST_CYCLE()
    {
        Ptr<cv::BackgroundSubtractorKNN> knn = createBackgroundSubtractorKNN();
        knn->setDetectShadows(false);
        u_foreground.release();
        u_background.release();
        for (int i = 0; i < nFrame; i++)
        {
            knn->apply(frame_buffer[i], u_foreground);
        }
        knn->getBackgroundImage(u_background);
    }
 #ifdef DEBUG_BGFG
    imwrite(format("fg_%d_%d_knn_ocl.png", frame_buffer[0].rows, cn), u_foreground.getMat(ACCESS_READ));
    imwrite(format("bg_%d_%d_knn_ocl.png", frame_buffer[0].rows, cn), u_background.getMat(ACCESS_READ));
 #endif
    SANITY_CHECK_NOTHING();
 }
 }}// namespace cvtest::ocl
 #endif
 #endif
--- a/modules/video/src/bgfg_KNN.cpp
+++ b/modules/video/src/bgfg_KNN.cpp
@ -42,6 +42,7 @@
 //#include <math.h>
 #include "precomp.hpp"
 #include "opencl_kernels_video.hpp"
 namespace cv
 {
@ -92,6 +93,9 @@ public:
    nLongCounter = 0;
    nMidCounter = 0;
    nShortCounter = 0;
 #ifdef HAVE_OPENCL
    opencl_ON = true;
 #endif
    }
    //! the full constructor that takes the length of the history,
    // the number of gaussian mixtures, the background ratio parameter and the noise strength
@ -119,6 +123,9 @@ public:
    nLongCounter = 0;
    nMidCounter = 0;
    nShortCounter = 0;
 #ifdef HAVE_OPENCL
    opencl_ON = true;
 #endif
    }
    //! the destructor
    ~BackgroundSubtractorKNNImpl() {}
@ -140,6 +147,50 @@ public:
        // Reserve memory for the model
        int size=frameSize.height*frameSize.width;
        //Reset counters
        nShortCounter = 0;
        nMidCounter = 0;
        nLongCounter = 0;
 #ifdef HAVE_OPENCL
        if (ocl::isOpenCLActivated() && opencl_ON)
        {
            create_ocl_apply_kernel();
            kernel_getBg.create("getBackgroundImage2_kernel", ocl::video::bgfg_knn_oclsrc, format( "-D CN=%d -D NSAMPLES=%d", nchannels, nN));
            if (kernel_apply.empty() || kernel_getBg.empty())
                opencl_ON = false;
        }
        else opencl_ON = false;
        if (opencl_ON)
        {
            u_flag.create(frameSize.height * nN * 3, frameSize.width, CV_8UC1);
            u_flag.setTo(Scalar::all(0));
            if (nchannels==3)
                nchannels=4;
            u_sample.create(frameSize.height * nN * 3, frameSize.width, CV_32FC(nchannels));
            u_sample.setTo(Scalar::all(0));
            u_aModelIndexShort.create(frameSize.height, frameSize.width, CV_8UC1);
            u_aModelIndexShort.setTo(Scalar::all(0));
            u_aModelIndexMid.create(frameSize.height, frameSize.width, CV_8UC1);
            u_aModelIndexMid.setTo(Scalar::all(0));
            u_aModelIndexLong.create(frameSize.height, frameSize.width, CV_8UC1);
            u_aModelIndexLong.setTo(Scalar::all(0));
            u_nNextShortUpdate.create(frameSize.height, frameSize.width, CV_8UC1);
            u_nNextShortUpdate.setTo(Scalar::all(0));
            u_nNextMidUpdate.create(frameSize.height, frameSize.width, CV_8UC1);
            u_nNextMidUpdate.setTo(Scalar::all(0));
            u_nNextLongUpdate.create(frameSize.height, frameSize.width, CV_8UC1);
            u_nNextLongUpdate.setTo(Scalar::all(0));
        }
        else
 #endif
        {
            // for each sample of 3 speed pixel models each pixel bg model we store ...
            // values + flag (nchannels+1 values)
            bgmodel.create( 1,(nN * 3) * (nchannels+1)* size,CV_8U);
@ -154,11 +205,6 @@ public:
            nNextMidUpdate.create(1,size,CV_8U);
            nNextLongUpdate.create(1,size,CV_8U);
    //Reset counters
    nShortCounter = 0;
    nMidCounter = 0;
    nLongCounter = 0;
            aModelIndexShort = Scalar::all(0);//random? //((m_nN)*rand())/(RAND_MAX+1);//0...m_nN-1
            aModelIndexMid = Scalar::all(0);
            aModelIndexLong = Scalar::all(0);
@ -166,6 +212,7 @@ public:
            nNextMidUpdate = Scalar::all(0);
            nNextLongUpdate = Scalar::all(0);
        }
    }
    virtual int getHistory() const { return history; }
    virtual void setHistory(int _nframes) { history = _nframes; }
@ -180,7 +227,19 @@ public:
    virtual void setDist2Threshold(double _dist2Threshold) { fTb = (float)_dist2Threshold; }
    virtual bool getDetectShadows() const { return bShadowDetection; }
-    virtual void setDetectShadows(bool detectshadows) { bShadowDetection = detectshadows; }
+    virtual void setDetectShadows(bool detectshadows)
    {
        if ((bShadowDetection && detectshadows) || (!bShadowDetection && !detectshadows))
            return;
        bShadowDetection = detectshadows;
 #ifdef HAVE_OPENCL
        if (!kernel_apply.empty())
        {
            create_ocl_apply_kernel();
            CV_Assert( !kernel_apply.empty() );
        }
 #endif
    }
    virtual int getShadowValue() const { return nShadowDetection; }
    virtual void setShadowValue(int value) { nShadowDetection = (uchar)value; }
@ -256,7 +315,29 @@ protected:
    Mat nNextMidUpdate;
    Mat nNextLongUpdate;
 #ifdef HAVE_OPENCL
    mutable bool opencl_ON;
    UMat u_flag;
    UMat u_sample;
    UMat u_aModelIndexShort;
    UMat u_aModelIndexMid;
    UMat u_aModelIndexLong;
    UMat u_nNextShortUpdate;
    UMat u_nNextMidUpdate;
    UMat u_nNextLongUpdate;
    mutable ocl::Kernel kernel_apply;
    mutable ocl::Kernel kernel_getBg;
 #endif
    String name_;
 #ifdef HAVE_OPENCL
    bool ocl_getBackgroundImage(OutputArray backgroundImage) const;
    bool ocl_apply(InputArray _image, OutputArray _fgmask, double learningRate=-1);
    void create_ocl_apply_kernel();
 #endif
 };
 CV_INLINE void
@ -328,7 +409,6 @@ CV_INLINE int
    include=0;//do we include this pixel into background model?
    int ndata=nchannels+1;
 //	float k;
    // now increase the probability for each pixel
    for (int n = 0; n < m_nN*3; n++)
    {
@ -546,18 +626,132 @@ public:
    uchar m_nShadowDetection;
 };
 #ifdef HAVE_OPENCL
 bool BackgroundSubtractorKNNImpl::ocl_apply(InputArray _image, OutputArray _fgmask, double learningRate)
 {
    bool needToInitialize = nframes == 0 || learningRate >= 1 || _image.size() != frameSize || _image.type() != frameType;
    if( needToInitialize )
        initialize(_image.size(), _image.type());
    ++nframes;
    learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./std::min( 2*nframes, history );
    CV_Assert(learningRate >= 0);
    _fgmask.create(_image.size(), CV_8U);
    UMat fgmask = _fgmask.getUMat();
    UMat frame = _image.getUMat();
    //recalculate update rates - in case alpha is changed
    // calculate update parameters (using alpha)
    int Kshort,Kmid,Klong;
    //approximate exponential learning curve
    Kshort=(int)(log(0.7)/log(1-learningRate))+1;//Kshort
    Kmid=(int)(log(0.4)/log(1-learningRate))-Kshort+1;//Kmid
    Klong=(int)(log(0.1)/log(1-learningRate))-Kshort-Kmid+1;//Klong
    //refresh rates
    int nShortUpdate = (Kshort/nN)+1;
    int nMidUpdate = (Kmid/nN)+1;
    int nLongUpdate = (Klong/nN)+1;
    int idxArg = 0;
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadOnly(frame));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadOnly(u_nNextLongUpdate));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadOnly(u_nNextMidUpdate));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadOnly(u_nNextShortUpdate));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_aModelIndexLong));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_aModelIndexMid));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_aModelIndexShort));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_flag));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_sample));
    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::WriteOnlyNoSize(fgmask));
    idxArg = kernel_apply.set(idxArg, nLongCounter);
    idxArg = kernel_apply.set(idxArg, nMidCounter);
    idxArg = kernel_apply.set(idxArg, nShortCounter);
    idxArg = kernel_apply.set(idxArg, fTb);
    idxArg = kernel_apply.set(idxArg, nkNN);
    idxArg = kernel_apply.set(idxArg, fTau);
    if (bShadowDetection)
        kernel_apply.set(idxArg, nShadowDetection);
    size_t globalsize[2] = {(size_t)frame.cols, (size_t)frame.rows};
    if(!kernel_apply.run(2, globalsize, NULL, true))
        return false;
    nShortCounter++;//0,1,...,nShortUpdate-1
    nMidCounter++;
    nLongCounter++;
    if (nShortCounter >= nShortUpdate)
    {
        nShortCounter = 0;
        randu(u_nNextShortUpdate, Scalar::all(0),  Scalar::all(nShortUpdate));
    }
    if (nMidCounter >= nMidUpdate)
    {
        nMidCounter = 0;
        randu(u_nNextMidUpdate, Scalar::all(0),  Scalar::all(nMidUpdate));
    }
    if (nLongCounter >= nLongUpdate)
    {
        nLongCounter = 0;
        randu(u_nNextLongUpdate, Scalar::all(0),  Scalar::all(nLongUpdate));
    }
    return true;
 }
 bool BackgroundSubtractorKNNImpl::ocl_getBackgroundImage(OutputArray _backgroundImage) const
 {
    _backgroundImage.create(frameSize, frameType);
    UMat dst = _backgroundImage.getUMat();
    int idxArg = 0;
    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::PtrReadOnly(u_flag));
    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::PtrReadOnly(u_sample));
    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::WriteOnly(dst));
    size_t globalsize[2] = {(size_t)dst.cols, (size_t)dst.rows};
    return kernel_getBg.run(2, globalsize, NULL, false);
 }
 void BackgroundSubtractorKNNImpl::create_ocl_apply_kernel()
 {
    int nchannels = CV_MAT_CN(frameType);
    String opts = format("-D CN=%d -D NSAMPLES=%d%s", nchannels, nN, bShadowDetection ? " -D SHADOW_DETECT" : "");
    kernel_apply.create("knn_kernel", ocl::video::bgfg_knn_oclsrc, opts);
 }
 #endif
 void BackgroundSubtractorKNNImpl::apply(InputArray _image, OutputArray _fgmask, double learningRate)
 {
    CV_INSTRUMENT_REGION()
-    Mat image = _image.getMat();
+#ifdef HAVE_OPENCL
-    bool needToInitialize = nframes == 0 || learningRate >= 1 || image.size() != frameSize || image.type() != frameType;
+    if (opencl_ON)
    {
 #ifndef __APPLE__
        CV_OCL_RUN(_fgmask.isUMat() && OCL_PERFORMANCE_CHECK(!ocl::Device::getDefault().isIntel() || _image.channels() == 1),
                   ocl_apply(_image, _fgmask, learningRate))
 #else
        CV_OCL_RUN(_fgmask.isUMat() && OCL_PERFORMANCE_CHECK(!ocl::Device::getDefault().isIntel()),
                   ocl_apply(_image, _fgmask, learningRate))
 #endif
        opencl_ON = false;
        nframes = 0;
    }
 #endif
    bool needToInitialize = nframes == 0 || learningRate >= 1 || _image.size() != frameSize || _image.type() != frameType;
    if( needToInitialize )
-        initialize(image.size(), image.type());
+        initialize(_image.size(), _image.type());
    Mat image = _image.getMat();
    _fgmask.create( image.size(), CV_8U );
    Mat fgmask = _fgmask.getMat();
@ -622,6 +816,15 @@ void BackgroundSubtractorKNNImpl::getBackgroundImage(OutputArray backgroundImage
 {
    CV_INSTRUMENT_REGION()
 #ifdef HAVE_OPENCL
    if (opencl_ON)
    {
        CV_OCL_RUN(opencl_ON, ocl_getBackgroundImage(backgroundImage))
        opencl_ON = false;
    }
 #endif
    int nchannels = CV_MAT_CN(frameType);
    //CV_Assert( nchannels == 3 );
    Mat meanBackground(frameSize, CV_8UC3, Scalar::all(0));
--- a/modules/video/src/opencl/bgfg_knn.cl
+++ b/modules/video/src/opencl/bgfg_knn.cl
@ -0,0 +1,248 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2018 Ya-Chiu Wu, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Ya-Chiu Wu, yacwu@cs.nctu.edu.tw
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #if CN==1
 #define T_MEAN float
 #define F_ZERO (0.0f)
 #define frameToMean(a, b) (b) = *(a);
 #define meanToFrame(a, b) *b = convert_uchar_sat(a);
 #else
 #define T_MEAN float4
 #define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
 #define meanToFrame(a, b)\
    b[0] = convert_uchar_sat(a.x); \
    b[1] = convert_uchar_sat(a.y); \
    b[2] = convert_uchar_sat(a.z);
 #define frameToMean(a, b)\
    b.x = a[0]; \
    b.y = a[1]; \
    b.z = a[2]; \
    b.w = 0.0f;
 #endif
 __kernel void knn_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,
                         __global const uchar* nNextLongUpdate,
                         __global const uchar* nNextMidUpdate,
                         __global const uchar* nNextShortUpdate,
                         __global uchar* aModelIndexLong,
                         __global uchar* aModelIndexMid,
                         __global uchar* aModelIndexShort,
                         __global uchar* flag,
                         __global uchar* sample,
                         __global uchar* fgmask, int fgmask_step, int fgmask_offset,
                         int nLongCounter, int nMidCounter, int nShortCounter,
                         float c_Tb, int c_nkNN, float c_tau
 #ifdef SHADOW_DETECT
                         , uchar c_shadowVal
 #endif
                         )
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
    if( x < frame_col && y < frame_row)
    {
        __global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));
        T_MEAN pix;
        frameToMean(_frame, pix);
        uchar foreground = 255; // 0 - the pixel classified as background
        int Pbf = 0;
        int Pb = 0;
        uchar include = 0;
        int pt_idx =  mad24(y, frame_col, x);
        int idx_step = frame_row * frame_col;
        __global T_MEAN* _sample = (__global T_MEAN*)(sample);
        for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)
        {
            int n_idx = mad24(n, idx_step, pt_idx);
            T_MEAN c_mean = _sample[n_idx];
            uchar c_flag = flag[n_idx];
            T_MEAN diff = c_mean - pix;
            float dist2 = dot(diff, diff);
            if (dist2 < c_Tb)
            {
                Pbf++;
                if (c_flag)
                {
                    Pb++;
                    if (Pb >= c_nkNN)
                    {
                        include = 1;
                        foreground = 0;
                        break;
                    }
                }
            }
        }
        if (Pbf >= c_nkNN)
        {
            include = 1;
        }
 #ifdef SHADOW_DETECT
        if (foreground)
        {
            int Ps = 0;
            for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)
            {
                int n_idx = mad24(n, idx_step, pt_idx);
                uchar c_flag = flag[n_idx];
                if (c_flag)
                {
                    T_MEAN c_mean = _sample[n_idx];
                    float numerator = dot(pix, c_mean);
                    float denominator = dot(c_mean, c_mean);
                    if (denominator == 0)
                        break;
                    if (numerator <= denominator && numerator >= c_tau * denominator)
                    {
                        float a = numerator / denominator;
                        T_MEAN dD = mad(a, c_mean, -pix);
                        if (dot(dD, dD) < c_Tb * a * a)
                        {
                            Ps++;
                            if (Ps >= c_nkNN)
                            {
                                foreground = c_shadowVal;
                                break;
                            }
                        }
                    }
                }
            }
        }
 #endif
        __global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);
        *_fgmask = (uchar)foreground;
        __global const uchar* _nNextLongUpdate = nNextLongUpdate + pt_idx;
        __global const uchar* _nNextMidUpdate = nNextMidUpdate + pt_idx;
        __global const uchar* _nNextShortUpdate = nNextShortUpdate + pt_idx;
        __global uchar* _aModelIndexLong = aModelIndexLong + pt_idx;
        __global uchar* _aModelIndexMid = aModelIndexMid + pt_idx;
        __global uchar* _aModelIndexShort = aModelIndexShort + pt_idx;
        uchar nextLongUpdate = _nNextLongUpdate[0];
        uchar nextMidUpdate = _nNextMidUpdate[0];
        uchar nextShortUpdate = _nNextShortUpdate[0];
        uchar modelIndexLong = _aModelIndexLong[0];
        uchar modelIndexMid = _aModelIndexMid[0];
        uchar modelIndexShort = _aModelIndexShort[0];
        int offsetLong = mad24(mad24(2, (NSAMPLES), modelIndexLong), idx_step, pt_idx);
        int offsetMid = mad24((NSAMPLES)+modelIndexMid, idx_step, pt_idx);
        int offsetShort = mad24(modelIndexShort, idx_step, pt_idx);
        if (nextLongUpdate == nLongCounter)
        {
            _sample[offsetLong] = _sample[offsetMid];
            flag[offsetLong] = flag[offsetMid];
            _aModelIndexLong[0] = (modelIndexLong >= ((NSAMPLES)-1)) ? 0 : (modelIndexLong + 1);
        }
        if (nextMidUpdate == nMidCounter)
        {
            _sample[offsetMid] = _sample[offsetShort];
            flag[offsetMid] = flag[offsetShort];
            _aModelIndexMid[0] = (modelIndexMid >= ((NSAMPLES)-1)) ? 0 : (modelIndexMid + 1);
        }
        if (nextShortUpdate == nShortCounter)
        {
            _sample[offsetShort] = pix;
            flag[offsetShort] = include;
            _aModelIndexShort[0] = (modelIndexShort >= ((NSAMPLES)-1)) ? 0 : (modelIndexShort + 1);
        }
    }
 }
 __kernel void getBackgroundImage2_kernel(__global const uchar* flag,
                                         __global const uchar* sample,
                                         __global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
    if(x < dst_col && y < dst_row)
    {
        int pt_idx =  mad24(y, dst_col, x);
        T_MEAN meanVal = (T_MEAN)F_ZERO;
        __global T_MEAN* _sample = (__global T_MEAN*)(sample);
        int idx_step = dst_row * dst_col;
        for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)
        {
            int n_idx = mad24(n, idx_step, pt_idx);
            uchar c_flag = flag[n_idx];
            if(c_flag)
            {
                meanVal = _sample[n_idx];
                break;
            }
        }
        __global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));
        meanToFrame(meanVal, _dst);
    }
 }