Merge pull request #1045 from bitwangyaoyao:2.4_fix

12 years ago · bf6b119a3f
parent 1cfe5aa41e 2ec1140b25
commit bf6b119a3f
6 changed files with 202 additions and 206 deletions
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@ -44,43 +44,21 @@

 int main(int argc, const char *argv[])
 {
-    vector<ocl::Info> oclinfo;
-    int num_devices = getDevice(oclinfo);
-
-    if (num_devices < 1)
-    {
-        cerr << "no device found\n";
-        return -1;
-    }
-    // set this to overwrite binary cache every time the test starts
-    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
-
-    int devidx = 0;
-
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
-        {
-            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
-        }
-    }
-
-    redirectError(cvErrorCallback);
-
    const char *keys =
        "{ h | help    | false | print help message }"
        "{ f | filter  |       | filter for test }"
        "{ w | workdir |       | set working directory }"
        "{ l | list    | false | show all tests }"
        "{ d | device  | 0     | device id }"
+        "{ c | cpu_ocl | false | use cpu as ocl device}"
        "{ i | iters   | 10    | iteration count }"
        "{ m | warmup  | 1     | gpu warm up iteration count}"
-        "{ t | xtop    | 1.1	  | xfactor top boundary}"
-        "{ b | xbottom | 0.9	  | xfactor bottom boundary}"
+        "{ t | xtop    | 1.1   | xfactor top boundary}"
+        "{ b | xbottom | 0.9   | xfactor bottom boundary}"
        "{ v | verify  | false | only run gpu once to verify if problems occur}";

+    redirectError(cvErrorCallback);
    CommandLineParser cmd(argc, argv, keys);
-
    if (cmd.get<bool>("help"))
    {
        cout << "Avaible options:" << endl;
@ -88,14 +66,40 @@ int main(int argc, const char *argv[])
        return 0;
    }

-    int device = cmd.get<int>("device");
+    // get ocl devices
+    bool use_cpu = cmd.get<bool>("c");
+    vector<ocl::Info> oclinfo;
+    int num_devices = 0;
+    if(use_cpu)
+        num_devices = getDevice(oclinfo, ocl::CVCL_DEVICE_TYPE_CPU);
+    else
+        num_devices = getDevice(oclinfo);
+    if (num_devices < 1)
+    {
+        cerr << "no device found\n";
+        return -1;
+    }

+    // show device info
+    int devidx = 0;
+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
+        {
+            cout << "device " << devidx++ << ": " << oclinfo[i].DeviceName[j] << endl;
+        }
+    }
+
+    int device = cmd.get<int>("device");
    if (device < 0 || device >= num_devices)
    {
        cerr << "Invalid device ID" << endl;
        return -1;
    }

+    // set this to overwrite binary cache every time the test starts
+    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
+    
    if (cmd.get<bool>("verify"))
    {
        TestSystem::instance().setNumIters(1);
@ -104,7 +108,6 @@ int main(int argc, const char *argv[])
    }

    devidx = 0;
-
    for (size_t i = 0; i < oclinfo.size(); i++)
    {
        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
@ -113,7 +116,7 @@ int main(int argc, const char *argv[])
            {
                ocl::setDevice(oclinfo[i], (int)j);
                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
-                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
+                cout << "use " << devidx << ": " <<oclinfo[i].DeviceName[j] << endl;
                goto END_DEV;
            }
        }
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@ -149,7 +149,7 @@ namespace cv
            cl_image_format format;
            int err;
            int depth    = mat.depth();
-            int channels = mat.channels();
+            int channels = mat.oclchannels();

            switch(depth)
            {
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@ -16,7 +16,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Sen Liu, sen@multicorewareinc.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
        blocky = size.height/TILE_SIZE;
    else
        blocky = size.height/TILE_SIZE + 1;
-    cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1);
-    cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
+    oclMat dst_m(blocky * 10, blockx, CV_64FC1);
+    oclMat sum(1, 10, CV_64FC1);
    int tile_width  = std::min(size.width,TILE_SIZE);
    int tile_height = std::min(size.height,TILE_SIZE);
    size_t localThreads[3]  = { tile_height, 1, 1};
@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-    openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);

    size_t localThreadss[3]  = { 128, 1, 1};
    size_t globalThreadss[3] = { 128, 1, 1};
@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height ));
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum ));
+    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
-    openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
-    double* dstsum = new double[10];
-    memset(dstsum,0,10*sizeof(double));
-    openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double));
-    mom->m00 = dstsum[0];
-    mom->m10 = dstsum[1];
-    mom->m01 = dstsum[2];
-    mom->m20 = dstsum[3];
-    mom->m11 = dstsum[4];
-    mom->m02 = dstsum[5];
-    mom->m30 = dstsum[6];
-    mom->m21 = dstsum[7];
-    mom->m12 = dstsum[8];
-    mom->m03 = dstsum[9];
-    delete [] dstsum;
-    openCLSafeCall(clReleaseMemObject(sum));
+    openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
+
+    Mat dstsum(sum);
+    mom->m00 = dstsum.at<double>(0, 0);
+    mom->m10 = dstsum.at<double>(0, 1);
+    mom->m01 = dstsum.at<double>(0, 2);
+    mom->m20 = dstsum.at<double>(0, 3);
+    mom->m11 = dstsum.at<double>(0, 4);
+    mom->m02 = dstsum.at<double>(0, 5);
+    mom->m30 = dstsum.at<double>(0, 6);
+    mom->m21 = dstsum.at<double>(0, 7);
+    mom->m12 = dstsum.at<double>(0, 8);
+    mom->m03 = dstsum.at<double>(0, 9);
+
    icvCompleteMomentState( mom );
 }

--- a/modules/ocl/src/opencl/moments.cl
+++ b/modules/ocl/src/opencl/moments.cl
@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt
            sum[i] = dst_sum[i][0];
 }

-__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, int TILE_SIZE)
 {
    uchar tmp_coi[16]; // get the coi data
    uchar16 tmp[16];
@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
    int x = wgidx*TILE_SIZE;  // vector length of uchar
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols - x);
-
-    if( tileSize_width < TILE_SIZE )
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
-    if( coi > 0 )	//channel of interest
-        for(int i = 0; i < tileSize_width; i += VLEN_C)
-        {
-            for(int j=0; j<VLEN_C; j++)
-                tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
-                                      tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_C)
-            tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if ( y+lidy < src_rows )
+    {
+        if( tileSize_width < TILE_SIZE )
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
+
+        if( coi > 0 )	//channel of interest
+            for(int i = 0; i < tileSize_width; i += VLEN_C)
+            {
+                for(int j=0; j<VLEN_C; j++)
+                    tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
+                                          tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_C)
+                tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
+    }
+
    uchar16 zero = (uchar16)(0);
    uchar16 full = (uchar16)(255);
    if( binary )
        for(int i=0; i < tileSize_width; i+=VLEN_C)
            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
+
    F mom[10];
    __local int m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
+    {
        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
+
    int lm[10] = {0};
    int16 x0 = (int16)(0);
    int16 x1 = (int16)(0);
@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
                m[i][lidy-j/2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
+
    if(lidy == 0&&lidx == 0)
    {
        for( int mt = 0; mt < 10; mt++ )
@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
    }
 }

-__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    ushort tmp_coi[8]; // get the coi data
    ushort8 tmp[32];
@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/2, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
-    if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
-        for(int i=tileSize_width; i < rstep; i++ )
-            *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_US)
-        {
-            for(int j=0; j<VLEN_US; j++)
-                tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_US)
-            tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
+            for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+            {
+                for(int j=0; j<VLEN_US; j++)
+                    tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+                tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    }
+
    ushort8 zero = (ushort8)(0);
    ushort8 full = (ushort8)(255);
    if( binary )
@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
    F mom[10];
    __local long m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
    barrier(CLK_LOCAL_MEM_FENCE);
+
    long lm[10] = {0};
    int8 x0 = (int8)(0);
    int8 x1 = (int8)(0);
@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);
+
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
            for( int i = 0; i < 10; i++ )
                lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
        if(lidy >= j/2&&lidy < j)
            for( int i = 0; i < 10; i++ )
                m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
    if(lidy == 0&&lidx == 0)
    {
        for(int mt = 0; mt < 10; mt++ )
@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
    }
 }

-__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    short tmp_coi[8]; // get the coi data
    short8 tmp[32];
@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/2, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_S)
-        {
-            for(int j=0; j<VLEN_S; j++)
-                tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_S)
-            tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+            {
+                for(int j=0; j<VLEN_S; j++)
+                    tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+                tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    }
+
    short8 zero = (short8)(0);
    short8 full = (short8)(255);
    if( binary )
@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols

    F mom[10];
    __local long m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
    barrier(CLK_LOCAL_MEM_FENCE);
    long lm[10] = {0};
    int8 x0 = (int8)(0);
@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
    }
 }

-__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
                            __global F* dst_m,
                            int dst_cols, int dst_step, int blocky,
-                            int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                            int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    float tmp_coi[4]; // get the coi data
    float4 tmp[64] ;
@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
    int y = wgidy*TILE_SIZE;  // real Y index of pixel
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
-    src_step /= sizeof(*src_data);
-    int rstep = min(src_step, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
+    int rstep = min(src_step/4, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
    int maxIdx = mul24(src_rows, src_cols);
    int yOff = (y+lidy)*src_step;
    int index;
-    if(tileSize_width < TILE_SIZE && yOff < src_rows)
-        for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
-            *(src_data+yOff+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_F)
-        {
-#pragma unroll
-            for(int j=0; j<4; j++)
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
            {
-                index = yOff+(x+i+j)*kcn+coi-1;
-                if (index < maxIdx)
-                    tmp_coi[j] = *(src_data+index);
-                else
-                    tmp_coi[j] = 0;
+                for(int j=0; j<4; j++)
+                    tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
            }
-            tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
-        }
-    else
-        for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
-            tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
+                tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
+    }
+
    float4 zero = (float4)(0);
    float4 full = (float4)(255);
    if( binary )
@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
            tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
    F mom[10];
    __local F m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
        for(int i = 0; i < 10; i ++)
-            for(int j = 0; j < 128; j ++)
-                m[i][j] = 0;
+            m[i][lidy] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    F lm[10] = {0};
    F4 x0 = (F4)(0);
@ -770,66 +789,42 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
        // accumulate moments computed in each tile
        dst_step /= sizeof(F);

-        int dst_x_off = mad24(wgidy, dst_cols, wgidx);
-        int dst_off = 0;
-        int max_dst_index = 10 * blocky * get_global_size(1);
-
        // + m00 ( = m00' )
-        dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[0];
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];

        // + m10 ( = m10' + x*m00' )
-        dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[1] + xm;
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
-        dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[2] + ym;
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
 }

-__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    F tmp_coi[4]; // get the coi data
    F4 tmp[64];
@ -844,22 +839,26 @@ __kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, in
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/8, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE,  src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols - x);
+    int tileSize_height = min(TILE_SIZE,  src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_D)
+            {
+                for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
+                    tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+            }
+        else
+            for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
+                tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
+    }

-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-        {
-            for(int j=0; j<4; j++)
-                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-            tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
    F4 zero = (F4)(0);
    F4 full = (F4)(255);
    if( binary )
@ -867,10 +866,9 @@ __kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, in
            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
    F mom[10];
    __local F m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
    barrier(CLK_LOCAL_MEM_FENCE);
    F lm[10] = {0};
    F4 x0 = (F4)(0);
@ -907,7 +905,6 @@ __kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, in
        m[1][lidy-bheight] = x1.s0;             // m10
        m[0][lidy-bheight] = x0.s0;             // m00
    }
-
    else if(lidy < bheight)
    {
        lm[9] = ((F)py) * sy;  // m03
@ -922,6 +919,7 @@ __kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, in
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);
+
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
--- a/modules/ocl/src/opencl/stereobm.cl
+++ b/modules/ocl/src/opencl/stereobm.cl
@ -162,8 +162,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
    int y_tex;
    int x_tex = X - radius;

-    if (x_tex >= cwidth)
-        return;
+    //if (x_tex >= cwidth)
+    //    return;

    for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
    {
--- a/modules/ocl/test/test_moments.cpp
+++ b/modules/ocl/test/test_moments.cpp
@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat)
    {
        if(test_contours)
        {
-            Mat src = imread( workdir + "../cpp/pic3.png", 1 );
-            Mat src_gray, canny_output;
-            cvtColor( src, src_gray, CV_BGR2GRAY );
+            Mat src = imread( workdir + "../cpp/pic3.png", IMREAD_GRAYSCALE );
+            ASSERT_FALSE(src.empty());
+            Mat canny_output;
            vector<vector<Point> > contours;
            vector<Vec4i> hierarchy;
-            Canny( src_gray, canny_output, 100, 200, 3 );
+            Canny( src, canny_output, 100, 200, 3 );
            findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
            for( size_t i = 0; i < contours.size(); i++ )
            {