From adca219f18570e08b4c9ad81463b8ca4854e7d8a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 2 Oct 2013 23:21:28 +0400 Subject: [PATCH] fixed convertC3C4 and convertC4C3 functions in case cols == 1 --- modules/ocl/src/matrix_operations.cpp | 147 +++++---------------- modules/ocl/src/opencl/convertC3C4.cl | 46 ++++--- modules/ocl/test/test_matrix_operation.cpp | 2 +- 3 files changed, 59 insertions(+), 136 deletions(-) diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index 78d1cd4afb..3ae14eb48d 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -58,12 +58,13 @@ using namespace std; //////////////////////////////// oclMat //////////////////////////////// //////////////////////////////////////////////////////////////////////// -//helper routines +// helper routines namespace cv { namespace ocl { - ///////////////////////////OpenCL kernel strings/////////////////////////// + /////////////////////////// OpenCL kernel strings /////////////////////////// + extern const char *operator_copyToM; extern const char *operator_convertTo; extern const char *operator_setTo; @@ -74,42 +75,18 @@ namespace cv } } - //////////////////////////////////////////////////////////////////////// // convert_C3C4 + static void convert_C3C4(const cl_mem &src, oclMat &dst) { - int dstStep_in_pixel = dst.step1() / dst.oclchannels(); - int pixel_end = dst.wholecols * dst.wholerows - 1; Context *clCxt = dst.clCxt; - string kernelName = "convertC3C4"; - char compile_option[32]; - switch(dst.depth()) - { - case 0: - sprintf(compile_option, "-D GENTYPE4=uchar4"); - break; - case 1: - sprintf(compile_option, "-D GENTYPE4=char4"); - break; - case 2: - sprintf(compile_option, "-D GENTYPE4=ushort4"); - break; - case 3: - sprintf(compile_option, "-D GENTYPE4=short4"); - break; - case 4: - sprintf(compile_option, "-D GENTYPE4=int4"); - break; - case 5: - sprintf(compile_option, "-D GENTYPE4=float4"); - break; - case 6: - sprintf(compile_option, "-D GENTYPE4=double4"); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unknown depth"); - } + int pixel_end = dst.wholecols * dst.wholerows - 1; + int dstStep_in_pixel = dst.step1() / dst.oclchannels(); + + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]); + vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); @@ -118,46 +95,24 @@ static void convert_C3C4(const cl_mem &src, oclMat &dst) args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel)); args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); - size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1}; - size_t localThreads[3] = {256, 1, 1}; + size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 }; + size_t localThreads[3] = { 256, 1, 1 }; - openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); + openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } + //////////////////////////////////////////////////////////////////////// // convert_C4C3 + static void convert_C4C3(const oclMat &src, cl_mem &dst) { int srcStep_in_pixel = src.step1() / src.oclchannels(); int pixel_end = src.wholecols * src.wholerows - 1; Context *clCxt = src.clCxt; - string kernelName = "convertC4C3"; - char compile_option[32]; - switch(src.depth()) - { - case 0: - sprintf(compile_option, "-D GENTYPE4=uchar4"); - break; - case 1: - sprintf(compile_option, "-D GENTYPE4=char4"); - break; - case 2: - sprintf(compile_option, "-D GENTYPE4=ushort4"); - break; - case 3: - sprintf(compile_option, "-D GENTYPE4=short4"); - break; - case 4: - sprintf(compile_option, "-D GENTYPE4=int4"); - break; - case 5: - sprintf(compile_option, "-D GENTYPE4=float4"); - break; - case 6: - sprintf(compile_option, "-D GENTYPE4=double4"); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unknown depth"); - } + + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]); vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); @@ -167,10 +122,10 @@ static void convert_C4C3(const oclMat &src, cl_mem &dst) args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel)); args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); - size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1}; - size_t localThreads[3] = {256, 1, 1}; + size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1}; + size_t localThreads[3] = { 256, 1, 1 }; - openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); + openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str()); } void cv::ocl::oclMat::upload(const Mat &m) @@ -179,14 +134,10 @@ void cv::ocl::oclMat::upload(const Mat &m) Size wholeSize; Point ofs; m.locateROI(wholeSize, ofs); - // int type = m.type(); - // if(m.oclchannels() == 3) - //{ - // type = CV_MAKETYPE(m.depth(), 4); - //} + create(wholeSize, m.type()); - if(m.channels() == 3) + if (m.channels() == 3) { int pitch = wholeSize.width * 3 * m.elemSize1(); int tail_padding = m.elemSize1() * 3072; @@ -197,35 +148,15 @@ void cv::ocl::oclMat::upload(const Mat &m) openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3); convert_C3C4(temp, *this); - //int* cputemp=new int[wholeSize.height*wholeSize.width * 3]; - //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; - //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, - // 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL)); - //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE, - // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL)); - //for(int i=0;istep/sizeof(int); - // for(int j=0;jempty()); - // int t = type(); - // if(download_channels == 3) - //{ - // t = CV_MAKETYPE(depth(), 3); - //} m.create(wholerows, wholecols, type()); if(m.channels() == 3) @@ -277,30 +203,14 @@ void cv::ocl::oclMat::download(cv::Mat &m) const convert_C4C3(*this, temp); openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3); - //int* cputemp=new int[wholecols*wholerows * 3]; - //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; - //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, - // 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL)); - //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE, - // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL)); - //for(int i=0;istep/sizeof(int); - // for(int j=0;jsupportsFeature(Context::CL_DOUBLE) && diff --git a/modules/ocl/src/opencl/convertC3C4.cl b/modules/ocl/src/opencl/convertC3C4.cl index 3e61827691..1908f92a2a 100644 --- a/modules/ocl/src/opencl/convertC3C4.cl +++ b/modules/ocl/src/opencl/convertC3C4.cl @@ -32,23 +32,23 @@ // the use of this software, even if advised of the possibility of such damage. // // -//#pragma OPENCL EXTENSION cl_amd_printf : enable + #if defined (DOUBLE_SUPPORT) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif + __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows, int dstStep_in_piexl,int pixel_end) { int id = get_global_id(0); - //int pixel_end = mul24(cols -1 , rows -1); int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2)); pixelid = clamp(pixelid,0,pixel_end); GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3; + pixel0 = src[pixelid.x]; pixel1 = src[pixelid.y]; pixel2 = src[pixelid.z]; - outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0); outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0); outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0); @@ -56,17 +56,19 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY int4 outy = (id<<2)/cols; int4 outx = (id<<2)%cols; - outx.y++; - outx.z+=2; - outx.w+=3; - outy = select(outy,outy+1,outx>=cols); - outx = select(outx,outx-cols,outx>=cols); - //outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows)); - //outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows)); - //outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows)); - //outx = select(outx,(int4)outx.x,outy>=rows); - //outy = select(outy,(int4)outy.x,outy>=rows); + + outx += (int4)(0, 1, 2, 3); + outy = select(outy, outy+1, outx>=cols); + outx = select(outx, outx-cols, outx>=cols); + + // when cols == 1 + outy = select(outy, outy + 1, outx >= cols); + outx = select(outx, outx-cols, outx >= cols); + outy = select(outy, outy + 1, outx >= cols); + outx = select(outx, outx-cols, outx >= cols); + int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx); + if(outx.w=(int4)cols); - y4=clamp(y4,(int4)0,(int4)(rows-1)); x4 = select(x4,x4-(int4)cols,x4>=(int4)cols); - int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4); + + // when cols == 1 + y4 = select(y4, y4 + 1,x4>=(int4)cols); + x4 = select(x4, x4 - (int4)cols,x4>=(int4)cols); + y4 = select(y4, y4 + 1,x4>=(int4)cols); + x4 = select(x4, x4-(int4)cols,x4>=(int4)cols); + + y4=clamp(y4,(int4)0,(int4)(rows-1)); + int4 addr = mad24(y4, (int4)srcStep_in_pixel, x4); + GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2; pixel0 = src[addr.x]; pixel1 = src[addr.y]; @@ -120,9 +128,11 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY outpixel2.y = pixel3.x; outpixel2.z = pixel3.y; outpixel2.w = pixel3.z; + int4 outaddr = mul24(id>>2 , 3); outaddr.y++; outaddr.z+=2; + if(outaddr.z <= pixel_end) { dst[outaddr.x] = pixel0; diff --git a/modules/ocl/test/test_matrix_operation.cpp b/modules/ocl/test/test_matrix_operation.cpp index d1d24689b5..46e077a6bb 100644 --- a/modules/ocl/test/test_matrix_operation.cpp +++ b/modules/ocl/test/test_matrix_operation.cpp @@ -402,7 +402,7 @@ PARAM_TEST_CASE(convertC3C4, MatType, bool) int type = CV_MAKE_TYPE(depth, 3); cv::RNG &rng = TS::ptr()->get_rng(); - src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 0, 40, false); + src = randomMat(rng, randomSize(1, MAX_VALUE), type, 0, 40, false); } void random_roi()