From 6035925f416bd5e1384ab5ac1f4969323438529c Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Wed, 25 Dec 2013 21:09:23 +0400 Subject: [PATCH 1/5] experimental moments implementation (does not work yet) --- modules/imgproc/src/moments.cpp | 207 ++++++++++++++++++-------- modules/imgproc/src/opencl/moments.cl | 110 ++++++++++++++ modules/imgproc/test/test_moments.cpp | 5 + 3 files changed, 257 insertions(+), 65 deletions(-) create mode 100644 modules/imgproc/src/opencl/moments.cl diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 14e672abdb..15bc83d97d 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -39,6 +39,7 @@ // //M*/ #include "precomp.hpp" +#include "opencl_kernels.hpp" namespace cv { @@ -362,106 +363,182 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1 nu30 = mu30*s3; nu21 = mu21*s3; nu12 = mu12*s3; nu03 = mu03*s3; } +static const int OCL_TILE_SIZE = 32; + +static bool ocl_moments( InputArray _src, Moments& m, bool binary ) +{ + printf("!!!!!!!!!!!!!!!!!! ocl moments !!!!!!!!!!!!!!!!!!!\n"); + const int K = 10; + ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, binary ? "-D BINARY_MOMENTS" : ""); + if( k.empty() ) + return false; + + UMat src = _src.getUMat(); + Size sz = src.size(); + int xtiles = (sz.width + OCL_TILE_SIZE-1)/OCL_TILE_SIZE; + int ytiles = (sz.height + OCL_TILE_SIZE-1)/OCL_TILE_SIZE; + int ntiles = xtiles*ytiles; + UMat umbuf(1, ntiles*K, CV_32S); + umbuf.setTo(Scalar::all(0)); + + size_t globalsize[] = {xtiles, ytiles}; + size_t localsize[] = {1, 1}; + bool ok = k.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::PtrWriteOnly(umbuf), + OCL_TILE_SIZE, xtiles, ytiles).run(2, globalsize, localsize, false); + if(!ok) + return false; + Mat mbuf; + umbuf.copyTo(mbuf); + for( int i = 0; i < ntiles; i++ ) + { + double x = (i % xtiles)*OCL_TILE_SIZE, y = (i / xtiles)*OCL_TILE_SIZE; + const int* mom = mbuf.ptr() + i*K; + double xm = x * mom[0], ym = y * mom[0]; + + // accumulate moments computed in each tile + + // + m00 ( = m00' ) + m.m00 += mom[0]; + + // + m10 ( = m10' + x*m00' ) + m.m10 += mom[1] + xm; + + // + m01 ( = m01' + y*m00' ) + m.m01 += mom[2] + ym; + + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + m.m20 += mom[3] + x * (mom[1] * 2 + xm); + + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + m.m11 += mom[4] + x * (mom[2] + ym) + y * mom[1]; + + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + m.m02 += mom[5] + y * (mom[2] * 2 + ym); + + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + m.m30 += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + m.m21 += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + m.m12 += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + m.m03 += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } + + return true; +} + } cv::Moments cv::moments( InputArray _src, bool binary ) { const int TILE_SIZE = 32; - Mat mat = _src.getMat(); MomentsInTileFunc func = 0; uchar nzbuf[TILE_SIZE*TILE_SIZE]; Moments m; - int type = mat.type(); + int type = _src.type(); int depth = CV_MAT_DEPTH( type ); int cn = CV_MAT_CN( type ); - - if( mat.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S)) - return contourMoments(mat); - - Size size = mat.size(); + Size size = _src.size(); if( cn > 1 ) - CV_Error( CV_StsBadArg, "Invalid image type" ); - + CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" ); + if( size.width <= 0 || size.height <= 0 ) return m; - - if( binary || depth == CV_8U ) - func = momentsInTile; - else if( depth == CV_16U ) - func = momentsInTile; - else if( depth == CV_16S ) - func = momentsInTile; - else if( depth == CV_32F ) - func = momentsInTile; - else if( depth == CV_64F ) - func = momentsInTile; + + if( ocl::useOpenCL() && depth == CV_8U && + size.width >= OCL_TILE_SIZE && + size.height >= OCL_TILE_SIZE && + /*_src.isUMat() &&*/ ocl_moments(_src, m, binary) ) + ; else - CV_Error( CV_StsUnsupportedFormat, "" ); - - Mat src0(mat); - - for( int y = 0; y < size.height; y += TILE_SIZE ) { - Size tileSize; - tileSize.height = std::min(TILE_SIZE, size.height - y); + Mat mat = _src.getMat(); + if( mat.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S)) + return contourMoments(mat); + + if( binary || depth == CV_8U ) + func = momentsInTile; + else if( depth == CV_16U ) + func = momentsInTile; + else if( depth == CV_16S ) + func = momentsInTile; + else if( depth == CV_32F ) + func = momentsInTile; + else if( depth == CV_64F ) + func = momentsInTile; + else + CV_Error( CV_StsUnsupportedFormat, "" ); - for( int x = 0; x < size.width; x += TILE_SIZE ) + Mat src0(mat); + + for( int y = 0; y < size.height; y += TILE_SIZE ) { - tileSize.width = std::min(TILE_SIZE, size.width - x); - Mat src(src0, cv::Rect(x, y, tileSize.width, tileSize.height)); + Size tileSize; + tileSize.height = std::min(TILE_SIZE, size.height - y); - if( binary ) + for( int x = 0; x < size.width; x += TILE_SIZE ) { - cv::Mat tmp(tileSize, CV_8U, nzbuf); - cv::compare( src, 0, tmp, CV_CMP_NE ); - src = tmp; - } + tileSize.width = std::min(TILE_SIZE, size.width - x); + Mat src(src0, cv::Rect(x, y, tileSize.width, tileSize.height)); - double mom[10]; - func( src, mom ); + if( binary ) + { + cv::Mat tmp(tileSize, CV_8U, nzbuf); + cv::compare( src, 0, tmp, CV_CMP_NE ); + src = tmp; + } - if(binary) - { - double s = 1./255; - for( int k = 0; k < 10; k++ ) - mom[k] *= s; - } + double mom[10]; + func( src, mom ); - double xm = x * mom[0], ym = y * mom[0]; + if(binary) + { + double s = 1./255; + for( int k = 0; k < 10; k++ ) + mom[k] *= s; + } - // accumulate moments computed in each tile + double xm = x * mom[0], ym = y * mom[0]; - // + m00 ( = m00' ) - m.m00 += mom[0]; + // accumulate moments computed in each tile - // + m10 ( = m10' + x*m00' ) - m.m10 += mom[1] + xm; + // + m00 ( = m00' ) + m.m00 += mom[0]; - // + m01 ( = m01' + y*m00' ) - m.m01 += mom[2] + ym; + // + m10 ( = m10' + x*m00' ) + m.m10 += mom[1] + xm; - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - m.m20 += mom[3] + x * (mom[1] * 2 + xm); + // + m01 ( = m01' + y*m00' ) + m.m01 += mom[2] + ym; - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - m.m11 += mom[4] + x * (mom[2] + ym) + y * mom[1]; + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + m.m20 += mom[3] + x * (mom[1] * 2 + xm); - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - m.m02 += mom[5] + y * (mom[2] * 2 + ym); + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + m.m11 += mom[4] + x * (mom[2] + ym) + y * mom[1]; - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - m.m30 += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + m.m02 += mom[5] + y * (mom[2] * 2 + ym); - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - m.m21 += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + m.m30 += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - m.m12 += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + m.m21 += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - m.m03 += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + m.m12 += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + m.m03 += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } } } diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl new file mode 100644 index 0000000000..190f201e61 --- /dev/null +++ b/modules/imgproc/src/opencl/moments.cl @@ -0,0 +1,110 @@ +/* See LICENSE file in the root OpenCV directory */ + +#ifdef BINARY_MOMENTS +#define READ_PIX(ref) (ref != 0) +#else +#define READ_PIX(ref) ref +#endif + +__kernel void moments(__global const uchar* src, int src_step, int src_offset, + int src_rows, int src_cols, __global int* mom0, + int tile_size, int xtiles, int ytiles) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int x_min = x*tile_size; + int y_min = y*tile_size; + + if( x_min < src_cols && y_min < src_rows ) + { + int x_max = src_cols - x_min; + int y_max = src_rows - y_min; + int m[10]={0,0,0,0,0,0,0,0,0,0}; + __global const uchar* ptr = (src + src_offset);// + y_min*src_step + x_min; + __global int* mom = mom0 + (xtiles*y + x)*10; + + x_max = x_max < tile_size ? x_max : tile_size; + y_max = y_max < tile_size ? y_max : tile_size; + + for( y = 0; y < y_max; y++ ) + { + int x00, x10, x20, x30; + int sx, sy, p; + x00 = x10 = x20 = x30 = 0; + sy = y*y; + + for( x = 0; x < x_max; x++ ) + { + p = ptr[0];//READ_PIX(ptr[x]); + sx = x*x; + x00 += p; + x10 += x*p; + x20 += sx*p; + x30 += x*sx*p; + } + + m[0] += x00; + m[1] += x10; + m[2] += y*x00; + m[3] += x20; + m[4] += y*x10; + m[5] += sy*x00; + m[6] += x30; + m[7] += y*x20; + m[8] += sy*x10; + m[9] += y*sy*x00; + //ptr += src_step; + } + + mom[0] = m[0]; + + mom[1] = m[1]; + mom[2] = m[2]; + + mom[3] = m[3]; + mom[4] = m[4]; + mom[5] = m[5]; + + mom[6] = m[6]; + mom[7] = m[7]; + mom[8] = m[8]; + mom[9] = m[9]; + } +} + +/*__kernel void moments(__global const uchar* src, int src_step, int src_offset, + int src_rows, int src_cols, __global float* mom0, + int tile_size, int xtiles, int ytiles) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if( x < xtiles && y < ytiles ) + { + //int x_min = x*tile_size; + //int y_min = y*tile_size; + //int x_max = src_cols - x_min; + //int y_max = src_rows - y_min; + __global const uchar* ptr = src + src_offset;// + src_step*y_min + x_min; + __global float* mom = mom0;// + (y*xtiles + x)*16; + //int x00, x10, x20, x30, m00=0; + //x_max = min(x_max, tile_size); + //y_max = min(y_max, tile_size); + //int m00 = 0; + + //for( y = 0; y < y_max; y++, ptr += src_step ) + //{ + //int x00 = 0, x10 = 0, x20 = 0, x30 = 0; + //for( x = 0; x < x_max; x++ ) + //{ + int p = ptr[x]; + //m00 = p; + //x10 += x*p; + /*x20 += x*x*p; + x30 += x*x*x*p; + //} + //m00 = m00 + x00; + //} + mom[0] = p; + } +}*/ + diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp index c58d1f53be..5e14bdba0f 100644 --- a/modules/imgproc/test/test_moments.cpp +++ b/modules/imgproc/test/test_moments.cpp @@ -108,6 +108,7 @@ void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, if( cn == 2 ) cn = 1; + sizes[INPUT][0].height = sizes[INPUT][0].width; types[INPUT][0] = CV_MAKETYPE(depth, cn); types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_64FC1; sizes[OUTPUT][0] = sizes[REF_OUTPUT][0] = cvSize(MOMENT_COUNT,1); @@ -274,6 +275,10 @@ void CV_MomentsTest::prepare_to_validation( int /*test_case_idx*/ ) mdata[6] = m.mu03 * s3; } + test_mat[REF_OUTPUT][0].copyTo(test_mat[OUTPUT][0]); + cout << "ref moments: " << test_mat[REF_OUTPUT][0] << "\n"; + cout << "fun moments: " << test_mat[OUTPUT][0] << "\n"; + double* a = test_mat[REF_OUTPUT][0].ptr(); double* b = test_mat[OUTPUT][0].ptr(); for( i = 0; i < MOMENT_COUNT; i++ ) From 83f749afd239dcac8fa75bdeaa6b9648a1d7edb2 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 26 Dec 2013 02:57:08 +0400 Subject: [PATCH 2/5] moments work now and work more or less fast --- modules/core/src/matrix.cpp | 6 ++ modules/imgproc/src/moments.cpp | 27 ++--- modules/imgproc/src/opencl/moments.cl | 142 +++++++++----------------- modules/imgproc/test/test_moments.cpp | 31 ++++-- 4 files changed, 88 insertions(+), 118 deletions(-) diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 6f2580498f..3cc928471e 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2261,6 +2261,12 @@ void _OutputArray::release() const ((Mat*)obj)->release(); return; } + + if( k == UMAT ) + { + ((UMat*)obj)->release(); + return; + } if( k == GPU_MAT ) { diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 15bc83d97d..0813435684 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -363,36 +363,31 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1 nu30 = mu30*s3; nu21 = mu21*s3; nu12 = mu12*s3; nu03 = mu03*s3; } -static const int OCL_TILE_SIZE = 32; - -static bool ocl_moments( InputArray _src, Moments& m, bool binary ) +static bool ocl_moments( InputArray _src, Moments& m) { - printf("!!!!!!!!!!!!!!!!!! ocl moments !!!!!!!!!!!!!!!!!!!\n"); + const int TILE_SIZE = 16; const int K = 10; - ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, binary ? "-D BINARY_MOMENTS" : ""); + ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d", TILE_SIZE)); if( k.empty() ) return false; UMat src = _src.getUMat(); Size sz = src.size(); - int xtiles = (sz.width + OCL_TILE_SIZE-1)/OCL_TILE_SIZE; - int ytiles = (sz.height + OCL_TILE_SIZE-1)/OCL_TILE_SIZE; + int xtiles = (sz.width + TILE_SIZE-1)/TILE_SIZE; + int ytiles = (sz.height + TILE_SIZE-1)/TILE_SIZE; int ntiles = xtiles*ytiles; UMat umbuf(1, ntiles*K, CV_32S); - umbuf.setTo(Scalar::all(0)); size_t globalsize[] = {xtiles, ytiles}; - size_t localsize[] = {1, 1}; bool ok = k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(umbuf), - OCL_TILE_SIZE, xtiles, ytiles).run(2, globalsize, localsize, false); + xtiles).run(2, globalsize, 0, true); if(!ok) return false; - Mat mbuf; - umbuf.copyTo(mbuf); + Mat mbuf = umbuf.getMat(ACCESS_READ); for( int i = 0; i < ntiles; i++ ) { - double x = (i % xtiles)*OCL_TILE_SIZE, y = (i / xtiles)*OCL_TILE_SIZE; + double x = (i % xtiles)*TILE_SIZE, y = (i / xtiles)*TILE_SIZE; const int* mom = mbuf.ptr() + i*K; double xm = x * mom[0], ym = y * mom[0]; @@ -452,10 +447,8 @@ cv::Moments cv::moments( InputArray _src, bool binary ) if( size.width <= 0 || size.height <= 0 ) return m; - if( ocl::useOpenCL() && depth == CV_8U && - size.width >= OCL_TILE_SIZE && - size.height >= OCL_TILE_SIZE && - /*_src.isUMat() &&*/ ocl_moments(_src, m, binary) ) + if( ocl::useOpenCL() && depth == CV_8U && !binary && + _src.isUMat() && ocl_moments(_src, m) ) ; else { diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl index 190f201e61..44c29d9c65 100644 --- a/modules/imgproc/src/opencl/moments.cl +++ b/modules/imgproc/src/opencl/moments.cl @@ -1,110 +1,70 @@ /* See LICENSE file in the root OpenCV directory */ -#ifdef BINARY_MOMENTS -#define READ_PIX(ref) (ref != 0) -#else -#define READ_PIX(ref) ref -#endif - __kernel void moments(__global const uchar* src, int src_step, int src_offset, - int src_rows, int src_cols, __global int* mom0, - int tile_size, int xtiles, int ytiles) + int src_rows, int src_cols, __global int* mom0, int xtiles) { int x = get_global_id(0); int y = get_global_id(1); - int x_min = x*tile_size; - int y_min = y*tile_size; + int x_min = x*TILE_SIZE; + int y_min = y*TILE_SIZE; if( x_min < src_cols && y_min < src_rows ) { - int x_max = src_cols - x_min; - int y_max = src_rows - y_min; - int m[10]={0,0,0,0,0,0,0,0,0,0}; - __global const uchar* ptr = (src + src_offset);// + y_min*src_step + x_min; + int x_max = min(src_cols - x_min, TILE_SIZE); + int y_max = min(src_rows - y_min, TILE_SIZE); + int m00=0, m10=0, m01=0, m20=0, m11=0, m02=0, m30=0, m21=0, m12=0, m03=0; + __global const uchar* ptr = src + src_offset + y_min*src_step + x_min; __global int* mom = mom0 + (xtiles*y + x)*10; - - x_max = x_max < tile_size ? x_max : tile_size; - y_max = y_max < tile_size ? y_max : tile_size; - for( y = 0; y < y_max; y++ ) + for( y = 0; y < y_max; y++, ptr += src_step ) { - int x00, x10, x20, x30; - int sx, sy, p; - x00 = x10 = x20 = x30 = 0; - sy = y*y; + int4 S = (int4)(0,0,0,0); - for( x = 0; x < x_max; x++ ) + for( x = 0; x <= x_max - 4; x += 4 ) { - p = ptr[0];//READ_PIX(ptr[x]); - sx = x*x; - x00 += p; - x10 += x*p; - x20 += sx*p; - x30 += x*sx*p; + int4 p = convert_int4(vload4(0, ptr + x)); + #define SUM_ELEM(elem, ofs) \ + (int4)(elem, (x+ofs)*elem, (x+ofs)*(x+ofs)*elem, (x+ofs)*(x+ofs)*(x+ofs)*elem) + S += SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3); } - - m[0] += x00; - m[1] += x10; - m[2] += y*x00; - m[3] += x20; - m[4] += y*x10; - m[5] += sy*x00; - m[6] += x30; - m[7] += y*x20; - m[8] += sy*x10; - m[9] += y*sy*x00; - //ptr += src_step; + if( x < x_max ) + { + int ps = ptr[x]; + S += SUM_ELEM(ps, 0); + if( x+1 < x_max ) + { + ps = ptr[x+1]; + S += SUM_ELEM(ps, 1); + if( x+2 < x_max ) + { + ps = ptr[x+2]; + S += SUM_ELEM(ps, 2); + } + } + } + + int sy = y*y; + m00 += S.s0; + m10 += S.s1; + m01 += y*S.s0; + m20 += S.s2; + m11 += y*S.s1; + m02 += sy*S.s0; + m30 += S.s3; + m21 += y*S.s2; + m12 += sy*S.s1; + m03 += y*sy*S.s0; } - mom[0] = m[0]; - - mom[1] = m[1]; - mom[2] = m[2]; - - mom[3] = m[3]; - mom[4] = m[4]; - mom[5] = m[5]; - - mom[6] = m[6]; - mom[7] = m[7]; - mom[8] = m[8]; - mom[9] = m[9]; + mom[0] = m00; + mom[1] = m10; + mom[2] = m01; + mom[3] = m20; + mom[4] = m11; + mom[5] = m02; + mom[6] = m30; + mom[7] = m21; + mom[8] = m12; + mom[9] = m03; } } - -/*__kernel void moments(__global const uchar* src, int src_step, int src_offset, - int src_rows, int src_cols, __global float* mom0, - int tile_size, int xtiles, int ytiles) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if( x < xtiles && y < ytiles ) - { - //int x_min = x*tile_size; - //int y_min = y*tile_size; - //int x_max = src_cols - x_min; - //int y_max = src_rows - y_min; - __global const uchar* ptr = src + src_offset;// + src_step*y_min + x_min; - __global float* mom = mom0;// + (y*xtiles + x)*16; - //int x00, x10, x20, x30, m00=0; - //x_max = min(x_max, tile_size); - //y_max = min(y_max, tile_size); - //int m00 = 0; - - //for( y = 0; y < y_max; y++, ptr += src_step ) - //{ - //int x00 = 0, x10 = 0, x20 = 0, x30 = 0; - //for( x = 0; x < x_max; x++ ) - //{ - int p = ptr[x]; - //m00 = p; - //x10 += x*p; - /*x20 += x*x*p; - x30 += x*x*x*p; - //} - //m00 = m00 + x00; - //} - mom[0] = p; - } -}*/ - diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp index 5e14bdba0f..52bccd6e93 100644 --- a/modules/imgproc/test/test_moments.cpp +++ b/modules/imgproc/test/test_moments.cpp @@ -60,6 +60,7 @@ protected: void run_func(); int coi; bool is_binary; + bool try_umat; }; @@ -102,20 +103,25 @@ void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, { RNG& rng = ts->get_rng(); cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); - int cn = cvtest::randInt(rng) % 4 + 1; + int cn = (cvtest::randInt(rng) % 4) + 1; int depth = cvtest::randInt(rng) % 4; depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F; - if( cn == 2 ) + + is_binary = cvtest::randInt(rng) % 2 != 0; + if( depth == 0 && !is_binary ) + try_umat = cvtest::randInt(rng) % 5 != 0; + else + try_umat = cvtest::randInt(rng) % 2 != 0; + + if( cn == 2 || try_umat ) cn = 1; - sizes[INPUT][0].height = sizes[INPUT][0].width; types[INPUT][0] = CV_MAKETYPE(depth, cn); types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_64FC1; sizes[OUTPUT][0] = sizes[REF_OUTPUT][0] = cvSize(MOMENT_COUNT,1); if(CV_MAT_DEPTH(types[INPUT][0])>=CV_32S) sizes[INPUT][0].width = MAX(sizes[INPUT][0].width, 3); - - is_binary = cvtest::randInt(rng) % 2 != 0; + coi = 0; cvmat_allowed = true; if( cn > 1 ) @@ -150,7 +156,16 @@ void CV_MomentsTest::run_func() { CvMoments* m = (CvMoments*)test_mat[OUTPUT][0].ptr(); double* others = (double*)(m + 1); - cvMoments( test_array[INPUT][0], m, is_binary ); + if( try_umat ) + { + UMat u; + test_mat[INPUT][0].clone().copyTo(u); + Moments new_m = moments(u, is_binary != 0); + *m = new_m; + } + else + cvMoments( test_array[INPUT][0], m, is_binary ); + others[0] = cvGetNormalizedCentralMoment( m, 2, 0 ); others[1] = cvGetNormalizedCentralMoment( m, 1, 1 ); others[2] = cvGetNormalizedCentralMoment( m, 0, 2 ); @@ -275,10 +290,6 @@ void CV_MomentsTest::prepare_to_validation( int /*test_case_idx*/ ) mdata[6] = m.mu03 * s3; } - test_mat[REF_OUTPUT][0].copyTo(test_mat[OUTPUT][0]); - cout << "ref moments: " << test_mat[REF_OUTPUT][0] << "\n"; - cout << "fun moments: " << test_mat[OUTPUT][0] << "\n"; - double* a = test_mat[REF_OUTPUT][0].ptr(); double* b = test_mat[OUTPUT][0].ptr(); for( i = 0; i < MOMENT_COUNT; i++ ) From a760c454ddf36143f25ef63e25898137e37f3e9d Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 26 Dec 2013 13:25:00 +0400 Subject: [PATCH 3/5] tuned the speed for OpenCL-based moments (still slower than the single-thread SSE2 CPU code :( ) --- modules/imgproc/src/opencl/moments.cl | 40 +++++++++++++++++++++------ modules/imgproc/test/test_moments.cpp | 26 ++++++++++++++++- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl index 44c29d9c65..9cc5a873c7 100644 --- a/modules/imgproc/src/opencl/moments.cl +++ b/modules/imgproc/src/opencl/moments.cl @@ -1,5 +1,9 @@ /* See LICENSE file in the root OpenCV directory */ +#if TILE_SIZE > 16 +#error "TILE SIZE should be <= 16" +#endif + __kernel void moments(__global const uchar* src, int src_step, int src_offset, int src_rows, int src_cols, __global int* mom0, int xtiles) { @@ -15,30 +19,50 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, int m00=0, m10=0, m01=0, m20=0, m11=0, m02=0, m30=0, m21=0, m12=0, m03=0; __global const uchar* ptr = src + src_offset + y_min*src_step + x_min; __global int* mom = mom0 + (xtiles*y + x)*10; + x = x_max & -4; for( y = 0; y < y_max; y++, ptr += src_step ) { - int4 S = (int4)(0,0,0,0); + int4 S = (int4)(0,0,0,0), p; - for( x = 0; x <= x_max - 4; x += 4 ) + #define SUM_ELEM(elem, ofs) \ + (int4)(1, (ofs), ((ofs)*(ofs)), ((ofs)*(ofs)*(ofs)))*elem + if( x_max >= 4 ) { - int4 p = convert_int4(vload4(0, ptr + x)); - #define SUM_ELEM(elem, ofs) \ - (int4)(elem, (x+ofs)*elem, (x+ofs)*(x+ofs)*elem, (x+ofs)*(x+ofs)*(x+ofs)*elem) + p = convert_int4(vload4(0, ptr)); S += SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3); + + if( x_max >= 8 ) + { + p = convert_int4(vload4(0, ptr+4)); + S += SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7); + + if( x_max >= 12 ) + { + p = convert_int4(vload4(0, ptr+8)); + S += SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11); + + if( x_max >= 16 ) + { + p = convert_int4(vload4(0, ptr+12)); + S += SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15); + } + } + } } + if( x < x_max ) { int ps = ptr[x]; - S += SUM_ELEM(ps, 0); + S += SUM_ELEM(ps, x); if( x+1 < x_max ) { ps = ptr[x+1]; - S += SUM_ELEM(ps, 1); + S += SUM_ELEM(ps, x+1); if( x+2 < x_max ) { ps = ptr[x+2]; - S += SUM_ELEM(ps, 2); + S += SUM_ELEM(ps, x+2); } } } diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp index 52bccd6e93..45987dc081 100644 --- a/modules/imgproc/test/test_moments.cpp +++ b/modules/imgproc/test/test_moments.cpp @@ -43,6 +43,13 @@ using namespace cv; using namespace std; +#define OCL_TUNING_MODE 0 +#if OCL_TUNING_MODE +#define OCL_TUNING_MODE_ONLY(code) code +#else +#define OCL_TUNING_MODE_ONLY(code) +#endif + // image moments class CV_MomentsTest : public cvtest::ArrayTest { @@ -71,6 +78,7 @@ CV_MomentsTest::CV_MomentsTest() test_array[REF_OUTPUT].push_back(NULL); coi = -1; is_binary = false; + OCL_TUNING_MODE_ONLY(test_case_count = 10); //element_wise_relative_error = false; } @@ -97,7 +105,6 @@ void CV_MomentsTest::get_minmax_bounds( int i, int j, int type, Scalar& low, Sca } } - void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, vector >& sizes, vector >& types ) { @@ -115,6 +122,14 @@ void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, if( cn == 2 || try_umat ) cn = 1; + + OCL_TUNING_MODE_ONLY( + cn = 1; + depth = CV_8U; + try_umat = true; + is_binary = false; + sizes[INPUT][0] = Size(1024,768) + ); types[INPUT][0] = CV_MAKETYPE(depth, cn); types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_64FC1; @@ -160,7 +175,16 @@ void CV_MomentsTest::run_func() { UMat u; test_mat[INPUT][0].clone().copyTo(u); + OCL_TUNING_MODE_ONLY( + static double ttime = 0; + static int ncalls = 0; + moments(u, is_binary != 0); + double t = (double)getTickCount()); Moments new_m = moments(u, is_binary != 0); + OCL_TUNING_MODE_ONLY( + ttime += (double)getTickCount() - t; + ncalls++; + printf("%g\n", ttime/ncalls/u.total())); *m = new_m; } else From e97dd57dc79bc1f3c31aa2f30753abc307cccc9e Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 26 Dec 2013 22:00:29 +0400 Subject: [PATCH 4/5] hopefully fixed test failures and complains from the doc builder --- modules/core/src/matrix.cpp | 2 +- modules/imgproc/src/moments.cpp | 40 +++++++++++++-------------- modules/imgproc/src/opencl/moments.cl | 10 +++---- modules/imgproc/test/test_moments.cpp | 10 +++---- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 3cc928471e..33c1d24ab2 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2261,7 +2261,7 @@ void _OutputArray::release() const ((Mat*)obj)->release(); return; } - + if( k == UMAT ) { ((UMat*)obj)->release(); diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 0813435684..02b4cc8355 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -370,14 +370,14 @@ static bool ocl_moments( InputArray _src, Moments& m) ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d", TILE_SIZE)); if( k.empty() ) return false; - + UMat src = _src.getUMat(); Size sz = src.size(); int xtiles = (sz.width + TILE_SIZE-1)/TILE_SIZE; int ytiles = (sz.height + TILE_SIZE-1)/TILE_SIZE; int ntiles = xtiles*ytiles; UMat umbuf(1, ntiles*K, CV_32S); - + size_t globalsize[] = {xtiles, ytiles}; bool ok = k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(umbuf), @@ -390,43 +390,43 @@ static bool ocl_moments( InputArray _src, Moments& m) double x = (i % xtiles)*TILE_SIZE, y = (i / xtiles)*TILE_SIZE; const int* mom = mbuf.ptr() + i*K; double xm = x * mom[0], ym = y * mom[0]; - + // accumulate moments computed in each tile - + // + m00 ( = m00' ) m.m00 += mom[0]; - + // + m10 ( = m10' + x*m00' ) m.m10 += mom[1] + xm; - + // + m01 ( = m01' + y*m00' ) m.m01 += mom[2] + ym; - + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) m.m20 += mom[3] + x * (mom[1] * 2 + xm); - + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) m.m11 += mom[4] + x * (mom[2] + ym) + y * mom[1]; - + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) m.m02 += mom[5] + y * (mom[2] * 2 + ym); - + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) m.m30 += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') m.m21 += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') m.m12 += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) m.m03 += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); } - + return true; } - + } @@ -441,13 +441,10 @@ cv::Moments cv::moments( InputArray _src, bool binary ) int cn = CV_MAT_CN( type ); Size size = _src.size(); - if( cn > 1 ) - CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" ); - if( size.width <= 0 || size.height <= 0 ) return m; - - if( ocl::useOpenCL() && depth == CV_8U && !binary && + + if( ocl::useOpenCL() && type == CV_8UC1 && !binary && _src.isUMat() && ocl_moments(_src, m) ) ; else @@ -456,6 +453,9 @@ cv::Moments cv::moments( InputArray _src, bool binary ) if( mat.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S)) return contourMoments(mat); + if( cn > 1 ) + CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" ); + if( binary || depth == CV_8U ) func = momentsInTile; else if( depth == CV_16U ) diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl index 9cc5a873c7..f6527b1657 100644 --- a/modules/imgproc/src/opencl/moments.cl +++ b/modules/imgproc/src/opencl/moments.cl @@ -31,17 +31,17 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, { p = convert_int4(vload4(0, ptr)); S += SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3); - + if( x_max >= 8 ) { p = convert_int4(vload4(0, ptr+4)); S += SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7); - + if( x_max >= 12 ) { p = convert_int4(vload4(0, ptr+8)); S += SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11); - + if( x_max >= 16 ) { p = convert_int4(vload4(0, ptr+12)); @@ -50,7 +50,7 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, } } } - + if( x < x_max ) { int ps = ptr[x]; @@ -66,7 +66,7 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, } } } - + int sy = y*y; m00 += S.s0; m10 += S.s1; diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp index 45987dc081..b74ee5db87 100644 --- a/modules/imgproc/test/test_moments.cpp +++ b/modules/imgproc/test/test_moments.cpp @@ -113,16 +113,16 @@ void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, int cn = (cvtest::randInt(rng) % 4) + 1; int depth = cvtest::randInt(rng) % 4; depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F; - + is_binary = cvtest::randInt(rng) % 2 != 0; if( depth == 0 && !is_binary ) try_umat = cvtest::randInt(rng) % 5 != 0; else try_umat = cvtest::randInt(rng) % 2 != 0; - + if( cn == 2 || try_umat ) cn = 1; - + OCL_TUNING_MODE_ONLY( cn = 1; depth = CV_8U; @@ -136,7 +136,7 @@ void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, sizes[OUTPUT][0] = sizes[REF_OUTPUT][0] = cvSize(MOMENT_COUNT,1); if(CV_MAT_DEPTH(types[INPUT][0])>=CV_32S) sizes[INPUT][0].width = MAX(sizes[INPUT][0].width, 3); - + coi = 0; cvmat_allowed = true; if( cn > 1 ) @@ -189,7 +189,7 @@ void CV_MomentsTest::run_func() } else cvMoments( test_array[INPUT][0], m, is_binary ); - + others[0] = cvGetNormalizedCentralMoment( m, 2, 0 ); others[1] = cvGetNormalizedCentralMoment( m, 1, 1 ); others[2] = cvGetNormalizedCentralMoment( m, 0, 2 ); From 48c7378c8ff01aad14442d06971a68259b4f2e2f Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 26 Dec 2013 23:29:04 +0400 Subject: [PATCH 5/5] improved performance of moments (on 720p or larger images) --- modules/imgproc/src/moments.cpp | 6 +- modules/imgproc/src/opencl/moments.cl | 125 ++++++++++++++++++-------- 2 files changed, 92 insertions(+), 39 deletions(-) diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 02b4cc8355..f1954cfe33 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -365,7 +365,7 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1 static bool ocl_moments( InputArray _src, Moments& m) { - const int TILE_SIZE = 16; + const int TILE_SIZE = 32; const int K = 10; ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d", TILE_SIZE)); if( k.empty() ) @@ -378,10 +378,10 @@ static bool ocl_moments( InputArray _src, Moments& m) int ntiles = xtiles*ytiles; UMat umbuf(1, ntiles*K, CV_32S); - size_t globalsize[] = {xtiles, ytiles}; + size_t globalsize[] = {xtiles, sz.height}, localsize[] = {1, TILE_SIZE}; bool ok = k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(umbuf), - xtiles).run(2, globalsize, 0, true); + xtiles).run(2, globalsize, localsize, true); if(!ok) return false; Mat mbuf = umbuf.getMat(ACCESS_READ); diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl index f6527b1657..0cf5b35440 100644 --- a/modules/imgproc/src/opencl/moments.cl +++ b/modules/imgproc/src/opencl/moments.cl @@ -1,32 +1,31 @@ /* See LICENSE file in the root OpenCV directory */ -#if TILE_SIZE > 16 -#error "TILE SIZE should be <= 16" +#if TILE_SIZE != 32 +#error "TILE SIZE should be 32" #endif __kernel void moments(__global const uchar* src, int src_step, int src_offset, int src_rows, int src_cols, __global int* mom0, int xtiles) { - int x = get_global_id(0); - int y = get_global_id(1); - int x_min = x*TILE_SIZE; - int y_min = y*TILE_SIZE; + int x0 = get_global_id(0); + int y0 = get_group_id(1); + int x, y = get_local_id(1); + int x_min = x0*TILE_SIZE; + int ypix = y0*TILE_SIZE + y; + __local int mom[TILE_SIZE][10]; - if( x_min < src_cols && y_min < src_rows ) + if( x_min < src_cols && y0*TILE_SIZE < src_rows ) { - int x_max = min(src_cols - x_min, TILE_SIZE); - int y_max = min(src_rows - y_min, TILE_SIZE); - int m00=0, m10=0, m01=0, m20=0, m11=0, m02=0, m30=0, m21=0, m12=0, m03=0; - __global const uchar* ptr = src + src_offset + y_min*src_step + x_min; - __global int* mom = mom0 + (xtiles*y + x)*10; - x = x_max & -4; - - for( y = 0; y < y_max; y++, ptr += src_step ) + if( ypix < src_rows ) { + int x_max = min(src_cols - x_min, TILE_SIZE); + __global const uchar* ptr = src + src_offset + ypix*src_step + x_min; int4 S = (int4)(0,0,0,0), p; #define SUM_ELEM(elem, ofs) \ - (int4)(1, (ofs), ((ofs)*(ofs)), ((ofs)*(ofs)*(ofs)))*elem + (int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem + + x = x_max & -4; if( x_max >= 4 ) { p = convert_int4(vload4(0, ptr)); @@ -51,6 +50,30 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, } } + if( x_max >= 20 ) + { + p = convert_int4(vload4(0, ptr+16)); + S += SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19); + + if( x_max >= 24 ) + { + p = convert_int4(vload4(0, ptr+20)); + S += SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23); + + if( x_max >= 28 ) + { + p = convert_int4(vload4(0, ptr+24)); + S += SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27); + + if( x_max >= 32 ) + { + p = convert_int4(vload4(0, ptr+28)); + S += SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31); + } + } + } + } + if( x < x_max ) { int ps = ptr[x]; @@ -68,27 +91,57 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, } int sy = y*y; - m00 += S.s0; - m10 += S.s1; - m01 += y*S.s0; - m20 += S.s2; - m11 += y*S.s1; - m02 += sy*S.s0; - m30 += S.s3; - m21 += y*S.s2; - m12 += sy*S.s1; - m03 += y*sy*S.s0; + + mom[y][0] = S.s0; + mom[y][1] = S.s1; + mom[y][2] = y*S.s0; + mom[y][3] = S.s2; + mom[y][4] = y*S.s1; + mom[y][5] = sy*S.s0; + mom[y][6] = S.s3; + mom[y][7] = y*S.s2; + mom[y][8] = sy*S.s1; + mom[y][9] = y*sy*S.s0; } + else + mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] = + mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0; + barrier(CLK_LOCAL_MEM_FENCE); - mom[0] = m00; - mom[1] = m10; - mom[2] = m01; - mom[3] = m20; - mom[4] = m11; - mom[5] = m02; - mom[6] = m30; - mom[7] = m21; - mom[8] = m12; - mom[9] = m03; + #define REDUCE(d) \ + if( y < d ) \ + { \ + mom[y][0] += mom[y+d][0]; \ + mom[y][1] += mom[y+d][1]; \ + mom[y][2] += mom[y+d][2]; \ + mom[y][3] += mom[y+d][3]; \ + mom[y][4] += mom[y+d][4]; \ + mom[y][5] += mom[y+d][5]; \ + mom[y][6] += mom[y+d][6]; \ + mom[y][7] += mom[y+d][7]; \ + mom[y][8] += mom[y+d][8]; \ + mom[y][9] += mom[y+d][9]; \ + } \ + barrier(CLK_LOCAL_MEM_FENCE) + + REDUCE(16); + REDUCE(8); + REDUCE(4); + REDUCE(2); + + if( y == 0 ) + { + __global int* momout = mom0 + (y0*xtiles + x0)*10; + momout[0] = mom[0][0] + mom[1][0]; + momout[1] = mom[0][1] + mom[1][1]; + momout[2] = mom[0][2] + mom[1][2]; + momout[3] = mom[0][3] + mom[1][3]; + momout[4] = mom[0][4] + mom[1][4]; + momout[5] = mom[0][5] + mom[1][5]; + momout[6] = mom[0][6] + mom[1][6]; + momout[7] = mom[0][7] + mom[1][7]; + momout[8] = mom[0][8] + mom[1][8]; + momout[9] = mom[0][9] + mom[1][9]; + } } }