diff --git a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
index e90da2bc9d..3d75e14b3e 100644
--- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
+++ b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
@@ -49,7 +49,7 @@ namespace cv
     namespace ocl
     {
         ////////////////////////////////////OpenCL kernel strings//////////////////////////
-        extern const char *convertC3C4;
+        //extern const char *convertC3C4;
         ////////////////////////////////////////////////////////////////////////
         //////////////////////////////// oclMat ////////////////////////////////
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index 752b554bdf..0efc72283a 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -49,6 +49,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/objdetect/objdetect.hpp"
+#include "opencv2/features2d/features2d.hpp"
 namespace cv
 {
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index dba7778b19..d709467158 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -455,13 +455,12 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
 }
 void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
 {
-    if(src1.clCxt -> impl -> double_support ==0)
-    {
-        CV_Error(-217,"Selected device don't support double\r\n");
-        return;
-    }
-    arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
+    if(src1.clCxt -> impl -> double_support !=0)
+        arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
+    else
+        arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
+
 }
 template
 void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar)
@@ -579,7 +578,14 @@ void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, co
     args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-    args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
+
+    if(src.clCxt -> impl -> double_support !=0)
+        args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
+    else
+    {
+        float f_scalar = (float)scalar;
+        args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar));
+    }
     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
 }
@@ -670,9 +676,9 @@ void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string ker
     int cols = divUp(dst.cols + offset_cols, vector_length);
     size_t localThreads[3]  = { 64, 4, 1 };
     size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
-                                    divUp(dst.rows, localThreads[1]) * localThreads[1],
-                                    1
-                                  };
+                                divUp(dst.rows, localThreads[1]) * localThreads[1],
+                                1
+                              };
     int dst_step1 = dst.cols * dst.elemSize();
     vector > args;
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
@@ -1253,7 +1259,11 @@ void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, c
     CV_Assert( src.type() == CV_32F || src.type() == CV_64F);
     Context  *clCxt = src.clCxt;
-
+    if(clCxt -> impl -> double_support ==0 && src.type() == CV_64F)
+    {
+        CV_Error(-217,"Selected device doesn't support double\r\n");
+ return; + } //int channels = dst.channels(); int depth = dst.depth(); @@ -2193,56 +2203,46 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, size_t localThreads[3] = { 256, 1, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(dst.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(dst.rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; - if(sizeof(double) == 8) + args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); + + if(src1.clCxt -> impl -> double_support != 0) { - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_double), (void *)&alpha )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); args.push_back( make_pair( sizeof(cl_double), (void *)&beta )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); args.push_back( make_pair( sizeof(cl_double), (void *)&gama )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); } else { - - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_float), (void *)&alpha )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); args.push_back( make_pair( sizeof(cl_float), (void *)&beta )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); args.push_back( make_pair( sizeof(cl_float), (void *)&gama )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - } + } + + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth); } void cv::ocl::magnitudeSqr(const oclMat &src1, 
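Two host-side patterns recur in the arithm.cpp hunks above: the hard error on devices without fp64 is dropped, and scalar kernel arguments are narrowed to cl_float when double_support is 0 (addWeighted does the same, grouping alpha/beta/gama so that only those three push_backs differ between the two paths). One review note: the new cv::ocl::divide() has identical bodies in both branches of its if/else, since the actual fallback now happens inside arithm_div.cl through its F typedef, so the branch could be collapsed to a single call. A minimal stand-alone sketch of the narrowing pattern against the public OpenCL API rather than the module's internal Context::impl flag (deviceHasFp64 and pushScalarArg are hypothetical helpers, not part of the patch):

    #include <CL/cl.h>
    #include <string>
    #include <utility>
    #include <vector>

    // True when the device advertises the cl_khr_fp64 extension.
    static bool deviceHasFp64(cl_device_id dev)
    {
        size_t len = 0;
        clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &len);
        std::string ext(len, '\0');
        clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, len, &ext[0], NULL);
        return ext.find("cl_khr_fp64") != std::string::npos;
    }

    // Push the scalar either as cl_double or as a narrowed cl_float copy,
    // mirroring the arithmetic_scalar_run change above.  The float copy is
    // owned by the caller so it still exists when the kernel is enqueued.
    static void pushScalarArg(std::vector<std::pair<size_t, const void *> > &args,
                              const double &scalar, float &f_scalar, bool hasFp64)
    {
        if (hasFp64)
            args.push_back(std::make_pair(sizeof(cl_double), (const void *)&scalar));
        else
        {
            f_scalar = (float)scalar;
            args.push_back(std::make_pair(sizeof(cl_float), (const void *)&f_scalar));
        }
    }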
const oclMat &src2, oclMat &dst) { CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() && - (src1.depth() == CV_32F )); + (src1.depth() == CV_32F )); dst.create(src1.size(), src1.type()); @@ -2265,9 +2265,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst) size_t localThreads[3] = { 256, 1, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(dst.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(dst.rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -2313,9 +2313,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst) size_t localThreads[3] = { 256, 1, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(dst.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(dst.rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -2348,9 +2348,9 @@ void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernel size_t localThreads[3] = { 64, 4, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index fd07df5cd1..7617c08c5e 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -410,7 +410,11 @@ namespace cv float ify = 1. / fy; double ifx_d = 1. / fx; double ify_d = 1. / fy; - + int srcStep_in_pixel = src.step1() / src.channels(); + int srcoffset_in_pixel = src.offset / src.elemSize(); + int dstStep_in_pixel = dst.step1() / dst.channels(); + int dstoffset_in_pixel = dst.offset / dst.elemSize(); + //printf("%d %d\n",src.step1() , dst.elemSize()); string kernelName; if(interpolation == INTER_LINEAR) kernelName = "resizeLN"; @@ -438,25 +442,33 @@ namespace cv { args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); - args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); + if(src.clCxt -> impl -> double_support != 0) + { + args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); + args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); + } + else + { + args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); + args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); + } } else { args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( 
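The resize host code above replaces byte-based step/offset kernel arguments with values measured in whole pixels (src.step1()/src.channels() and src.offset/src.elemSize()), so the rewritten kernels can index the typed buffers directly instead of shifting byte offsets per element. A small sketch of that conversion, assuming an oclMat-like descriptor whose step and offset are in bytes (step1()/channels() equals step/elemSize() because elemSize = elemSize1 * channels):

    #include <cstddef>

    // Pixel-oriented geometry expected by the rewritten resize kernels.
    struct PixelGeometry
    {
        int step_in_pixel;    // row pitch, in whole pixels
        int offset_in_pixel;  // distance from buffer start to the ROI origin, in pixels
    };

    static PixelGeometry toPixelUnits(size_t step_in_bytes, size_t offset_in_bytes,
                                      size_t elemSize /* bytes per pixel, all channels */)
    {
        PixelGeometry g;
        g.step_in_pixel   = (int)(step_in_bytes   / elemSize);
        g.offset_in_pixel = (int)(offset_in_bytes / elemSize);
        return g;
    }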
make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index feff1db830..61e7177ae9 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -378,20 +378,36 @@ namespace cv void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, const void *src, size_t spitch, - size_t width, size_t height, enum openCLMemcpyKind kind) + size_t width, size_t height, enum openCLMemcpyKind kind, int channels) { size_t buffer_origin[3] = {0, 0, 0}; size_t host_origin[3] = {0, 0, 0}; size_t region[3] = {width, height, 1}; if(kind == clMemcpyHostToDevice) { - openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, - buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0)); + if(dpitch == width || channels==3) + { + openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, + 0, width*height, src, 0, NULL, NULL)); + } + else + { + openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, + buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0)); + } } else if(kind == clMemcpyDeviceToHost) { - openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, - buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0)); + if(spitch == width || channels==3) + { + openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, + 0, width*height, dst, 0, NULL, NULL)); + } + else + { + openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, + buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0)); + } } } diff --git a/modules/ocl/src/kernels/arithm_addWeighted.cl b/modules/ocl/src/kernels/arithm_addWeighted.cl index a34fd8d85c..434010068d 100644 --- a/modules/ocl/src/kernels/arithm_addWeighted.cl +++ b/modules/ocl/src/kernels/arithm_addWeighted.cl @@ -51,9 +51,9 @@ typedef float F; ////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////addWeighted////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int src1_offset, - __global uchar *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset, + __global uchar *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, __global uchar *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -99,9 +99,9 @@ __kernel void addWeighted_D0 (__global uchar 
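openCLMemcpy2D above now issues a plain clEnqueueWriteBuffer/clEnqueueReadBuffer when the copy is contiguous (pitch equal to the row width, or the packed 3-channel staging path with channels==3), and keeps the *BufferRect variants only for genuinely strided copies. A self-contained sketch of the same decision for the host-to-device direction, assuming a valid cl_command_queue; it is slightly stricter than the patch in that it requires both pitches to be tight:

    #include <CL/cl.h>

    // Copy `height` rows of `width` bytes from host to device, choosing the
    // cheaper linear copy when both pitches are exactly one row wide.
    static cl_int copyHostToDevice2D(cl_command_queue q, cl_mem dst, size_t dpitch,
                                     const void *src, size_t spitch,
                                     size_t width, size_t height)
    {
        if (dpitch == width && spitch == width)   // contiguous on both sides
            return clEnqueueWriteBuffer(q, dst, CL_TRUE, 0, width * height, src,
                                        0, NULL, NULL);

        size_t buffer_origin[3] = {0, 0, 0};
        size_t host_origin[3]   = {0, 0, 0};
        size_t region[3]        = {width, height, 1};
        return clEnqueueWriteBufferRect(q, dst, CL_TRUE, buffer_origin, host_origin,
                                        region, dpitch, 0, spitch, 0, src,
                                        0, NULL, NULL);
    }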
*src1, F alpha,int src1_step,int sr -__kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int src1_offset, - __global ushort *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset, + __global ushort *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, __global ushort *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -145,9 +145,9 @@ __kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int s } -__kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int src1_offset, - __global short *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset, + __global short *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, __global short *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -190,9 +190,9 @@ __kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int sr } -__kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1_offset, - __global int *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, + __global int *src2, int src2_step,int src2_offset, + F alpha,F beta, F gama, __global int *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -238,9 +238,9 @@ __kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1 } -__kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int src1_offset, - __global float *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset, + __global float *src2, int src2_step,int src2_offset, + F alpha,F beta, F gama, __global float *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -286,9 +286,9 @@ __kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int sr } #if defined (DOUBLE_SUPPORT) -__kernel void addWeighted_D6 (__global double *src1, F alpha,int src1_step,int src1_offset, - __global double *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset, + __global double *src2, int src2_step,int src2_offset, + F alpha,F beta, F gama, __global double *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { diff --git a/modules/ocl/src/kernels/arithm_cartToPolar.cl b/modules/ocl/src/kernels/arithm_cartToPolar.cl index d4aa83a6a2..a2f65e0b73 100644 --- a/modules/ocl/src/kernels/arithm_cartToPolar.cl +++ b/modules/ocl/src/kernels/arithm_cartToPolar.cl @@ -49,6 +49,10 @@ #define CV_PI 3.1415926535897932384626433832795 +#ifndef DBL_EPSILON +#define DBL_EPSILON 0x1.0p-52 +#endif + __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset, __global float *src2, int src2_step, int src2_offset, __global float *dst1, int dst1_step, int dst1_offset, //magnitude diff --git a/modules/ocl/src/kernels/arithm_div.cl b/modules/ocl/src/kernels/arithm_div.cl index 43858f0233..ae4f46ab10 100644 --- a/modules/ocl/src/kernels/arithm_div.cl +++ b/modules/ocl/src/kernels/arithm_div.cl @@ -45,36 +45,45 @@ #if defined (DOUBLE_SUPPORT) #pragma OPENCL EXTENSION cl_khr_fp64:enable +typedef double F ; +typedef double4 F4; +#define convert_F4 convert_double4 +#define convert_F 
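The arithm_addWeighted.cl hunks above reorder every addWeighted_D* signature to (src1, step, offset, src2, step, offset, alpha, beta, gama, dst, ...), which is exactly the order the reworked host code in arithm.cpp now pushes its arguments, so the two sides must stay positionally in sync whenever either is touched. A hedged sketch of factoring the three weights into one helper on the host side (pushWeights is hypothetical; the narrowed float copies must outlive the kernel launch):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Append alpha, beta and gama in the positions the reordered
    // addWeighted_D* kernels expect them (right after the two source descriptors).
    template <typename T>
    static void pushWeights(std::vector<std::pair<size_t, const void *> > &args,
                            const T &alpha, const T &beta, const T &gama)
    {
        args.push_back(std::make_pair(sizeof(T), (const void *)&alpha));
        args.push_back(std::make_pair(sizeof(T), (const void *)&beta));
        args.push_back(std::make_pair(sizeof(T), (const void *)&gama));
    }

    // usage:
    //   if (hasFp64) pushWeights<double>(args, alpha, beta, gama);
    //   else         pushWeights<float>(args, fa, fb, fg);  // fa/fb/fg: float copies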
convert_double +#else +typedef float F; +typedef float4 F4; +#define convert_F4 convert_float4 +#define convert_F convert_float #endif -uchar round2_uchar(double v){ +uchar round2_uchar(F v){ - uchar v1 = convert_uchar_sat(v); - uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); + uchar v1 = convert_uchar_sat(round(v)); + //uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } -ushort round2_ushort(double v){ +ushort round2_ushort(F v){ - ushort v1 = convert_ushort_sat(v); - ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); + ushort v1 = convert_ushort_sat(round(v)); + //ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } -short round2_short(double v){ +short round2_short(F v){ - short v1 = convert_short_sat(v); - short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5)); + short v1 = convert_short_sat(round(v)); + //short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } -int round2_int(double v){ +int round2_int(F v){ - int v1 = convert_int_sat(v); - int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); + int v1 = convert_int_sat(round(v)); + //int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } /////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////divide/////////////////////////////////////////////////// @@ -83,7 +92,7 @@ int round2_int(double v){ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -104,13 +113,13 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - double4 tmp = convert_double4(src1_data) * scalar; + F4 tmp = convert_F4(src1_data) * scalar; uchar4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (double)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (double)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (double)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (double)src2_data.w); + tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x); + tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y); + tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z); + tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? 
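arithm_div.cl above now defines F/F4 and convert_F/convert_F4 as double types only under DOUBLE_SUPPORT and as float otherwise, so the same kernel source compiles on devices without cl_khr_fp64; arithm_cartToPolar.cl similarly gains a portable DBL_EPSILON spelled as the hex-float literal 0x1.0p-52, i.e. 2^-52. A quick host-side check that this literal really is the IEEE-754 binary64 machine epsilon (C++17 hex floats, nothing OpenCV-specific):

    #include <cfloat>

    int main()
    {
        // 0x1.0p-52 == 2^-52 == DBL_EPSILON for IEEE-754 binary64.
        static_assert(0x1.0p-52 == DBL_EPSILON, "unexpected double format");
        return 0;
    }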
tmp_data.y : dst_data.y; @@ -124,7 +133,7 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset, __global ushort *src2, int src2_step, int src2_offset, __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -145,13 +154,13 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - double4 tmp = convert_double4(src1_data) * scalar; + F4 tmp = convert_F4(src1_data) * scalar; ushort4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (double)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (double)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (double)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (double)src2_data.w); + tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x); + tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y); + tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z); + tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; @@ -164,7 +173,7 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset, __global short *src2, int src2_step, int src2_offset, __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -185,13 +194,13 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - double4 tmp = convert_double4(src1_data) * scalar; + F4 tmp = convert_F4(src1_data) * scalar; short4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (double)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (double)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (double)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (double)src2_data.w); + tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x); + tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y); + tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z); + tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (F)src2_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
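The round2_uchar/round2_ushort/round2_short/round2_int helpers used above now call OpenCL's round(), which rounds halfway cases away from zero, whereas the old (now commented-out) expression emulated round-half-to-even; for quotients that land exactly on .5 the results can differ by one, which is worth keeping in mind when comparing against the CPU path. The same difference shown in plain C++ (std::rint follows the default round-to-nearest-even mode, std::round does not):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double v = 2.5;
        // std::round(): halfway away from zero -> 3
        // std::rint():  round-half-to-even     -> 2 (default FE_TONEAREST mode)
        std::printf("round(%.1f) = %g, rint(%.1f) = %g\n",
                    v, std::round(v), v, std::rint(v));
        return 0;
    }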
tmp_data.x : dst_data.x; @@ -206,7 +215,7 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, __global int *src2, int src2_step, int src2_offset, __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -220,8 +229,8 @@ __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index)); - double tmp = convert_double(data1) * scalar; - int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_double)(data2)); + F tmp = convert_F(data1) * scalar; + int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2)); *((__global int *)((__global char *)dst + dst_index)) =tmp_data; } @@ -230,7 +239,7 @@ __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset, __global float *src2, int src2_step, int src2_offset, __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -244,13 +253,14 @@ __kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offse float data1 = *((__global float *)((__global char *)src1 + src1_index)); float data2 = *((__global float *)((__global char *)src2 + src2_index)); - double tmp = convert_double(data1) * scalar; - float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_double)(data2)); + F tmp = convert_F(data1) * scalar; + float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2)); *((__global float *)((__global char *)dst + dst_index)) = tmp_data; } } +#if defined (DOUBLE_SUPPORT) __kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset, __global double *src2, int src2_step, int src2_offset, __global double *dst, int dst_step, int dst_offset, @@ -274,10 +284,11 @@ __kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offs *((__global double *)((__global char *)dst + dst_index)) = tmp_data; } } +#endif /************************************div with scalar************************************/ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset, __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -297,10 +308,10 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (double)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (double)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (double)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (double)src_data.w); + tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 
0 : round2_uchar(scalar / (F)src_data.x); + tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y); + tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z); + tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; @@ -313,7 +324,7 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset, __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -333,10 +344,10 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (double)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (double)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (double)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (double)src_data.w); + tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x); + tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y); + tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z); + tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; @@ -348,7 +359,7 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse } __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset, __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -368,10 +379,10 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (double)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (double)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (double)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (double)src_data.w); + tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x); + tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y); + tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z); + tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 
0 : round2_short(scalar / (F)src_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -385,7 +396,7 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -397,7 +408,7 @@ __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, int data = *((__global int *)((__global char *)src + src_index)); - int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_double)(data)); + int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data)); *((__global int *)((__global char *)dst + dst_index)) =tmp_data; } @@ -405,7 +416,7 @@ __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, __kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset, __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -417,12 +428,13 @@ __kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset float data = *((__global float *)((__global char *)src + src_index)); - float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_double)(data)); + float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_F)(data)); *((__global float *)((__global char *)dst + dst_index)) = tmp_data; } } +#if defined (DOUBLE_SUPPORT) __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset, __global double *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1, double scalar) @@ -442,5 +454,6 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse *((__global double *)((__global char *)dst + dst_index)) = tmp_data; } } +#endif diff --git a/modules/ocl/src/kernels/arithm_exp.cl b/modules/ocl/src/kernels/arithm_exp.cl index 18f7f01119..1b283a093f 100644 --- a/modules/ocl/src/kernels/arithm_exp.cl +++ b/modules/ocl/src/kernels/arithm_exp.cl @@ -70,6 +70,8 @@ __kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int sr } } + +#if defined (DOUBLE_SUPPORT) __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst) { int x = get_global_id(0); @@ -87,3 +89,5 @@ __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int sr // dst[dstIdx] = exp(src[srcIdx]); } } + +#endif diff --git a/modules/ocl/src/kernels/arithm_log.cl b/modules/ocl/src/kernels/arithm_log.cl index ba93cc3f53..0810848004 100644 --- a/modules/ocl/src/kernels/arithm_log.cl +++ b/modules/ocl/src/kernels/arithm_log.cl @@ -73,7 +73,7 @@ __kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int sr } } - +#if defined (DOUBLE_SUPPORT) __kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst) { int x = get_global_id(0); @@ -91,4 +91,4 @@ __kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int sr } } - +#endif diff --git a/modules/ocl/src/kernels/convertC3C4.cl 
b/modules/ocl/src/kernels/convertC3C4.cl index 54f0fd9eeb..1b21fe68ce 100644 --- a/modules/ocl/src/kernels/convertC3C4.cl +++ b/modules/ocl/src/kernels/convertC3C4.cl @@ -6,7 +6,7 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Zero Lin, zero.lin@amd.com +// Niko Li, newlife20080214@gmail.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -32,106 +32,107 @@ // the use of this software, even if advised of the possibility of such damage. // // - -__kernel void convertC3C4_D0(__global const char4 * restrict src, __global char4 *dst, int cols, int rows, - int srcStep, int dstStep) +//#pragma OPENCL EXTENSION cl_amd_printf : enable +__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows, + int dstStep_in_piexl,int pixel_end) { int id = get_global_id(0); - int y = id / cols; - int x = id % cols; + //int pixel_end = mul24(cols -1 , rows -1); + int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2)); + pixelid = clamp(pixelid,0,pixel_end); + GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3; + pixel0 = src[pixelid.x]; + pixel1 = src[pixelid.y]; + pixel2 = src[pixelid.z]; - int d = y * srcStep + x * 3; - char8 data = (char8)(src[d>>2], src[(d>>2) + 1]); - char temp[8] = {data.s0, data.s1, data.s2, data.s3, data.s4, data.s5, data.s6, data.s7}; - - int start = d & 3; - char4 ndata = (char4)(temp[start], temp[start + 1], temp[start + 2], 0); - if(y < rows) - dst[y * dstStep + x] = ndata; -} -__kernel void convertC3C4_D1(__global const short* restrict src, __global short4 *dst, int cols, int rows, - int srcStep, int dstStep) -{ - int id = get_global_id(0); - int y = id / cols; - int x = id % cols; + outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0); + outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0); + outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0); + outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0); - int d = (y * srcStep + x * 6)>>1; - short4 data = *(__global short4 *)(src + ((d>>1)<<1)); - short temp[4] = {data.s0, data.s1, data.s2, data.s3}; - - int start = d & 1; - short4 ndata = (short4)(temp[start], temp[start + 1], temp[start + 2], 0); - if(y < rows) - dst[y * dstStep + x] = ndata; + int4 outy = (id<<2)/cols; + int4 outx = (id<<2)%cols; + outx.y++; + outx.z+=2; + outx.w+=3; + outy = select(outy,outy+1,outx>=cols); + outx = select(outx,outx-cols,outx>=cols); + //outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows)); + //outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows)); + //outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows)); + //outx = select(outx,(int4)outx.x,outy>=rows); + //outy = select(outy,(int4)outy.x,outy>=rows); + int4 addr = mad24(outy,dstStep_in_piexl,outx); + if(outx.w>2; - int4 data = *(__global int4 *)(src + d); - data.z = 0; - - if(y < rows) - dst[y * dstStep + x] = data; -} -__kernel void convertC4C3_D2(__global const int4 * restrict src, __global int *dst, int cols, int rows, - int srcStep, int dstStep) -{ - int id = get_global_id(0); - int y = id / cols; - int x = id % cols; - - int4 data = src[y * srcStep + x]; - - if(y < rows) - { - int d = y * dstStep + x * 3; - dst[d] = data.x; - dst[d + 1] = data.y; - dst[d + 2] = data.z; - } -} -__kernel void convertC4C3_D1(__global const short4 * restrict src, __global short *dst, int cols, int rows, - int srcStep, int dstStep) +__kernel void convertC4C3(__global const GENTYPE4 * 
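The rewritten convertC3C4 kernel above is type-generic (GENTYPE4 is injected through a -D build option) and lets each work item load three 4-wide vectors, i.e. twelve interleaved channel values covering four packed 3-channel pixels, then scatter them back as four xyz0 pixels, clamping the load indices so the tail work item never reads past the buffer. A rough host-side model of one work item; this is a simplification of the kernel's select/mad24 arithmetic, not a drop-in port:

    #include <algorithm>

    struct Vec4 { int x, y, z, w; };           // stand-in for GENTYPE4

    static void convertC3C4_item(const Vec4 *src, Vec4 *dst, int id,
                                 int cols, int dstStep_in_pixel, int pixel_end)
    {
        int i0 = std::min(3 * id,     pixel_end);   // clamp the tail work item's loads
        int i1 = std::min(3 * id + 1, pixel_end);
        int i2 = std::min(3 * id + 2, pixel_end);
        Vec4 p0 = src[i0], p1 = src[i1], p2 = src[i2];

        // Repack 4 x 3 channels into 4 x 4 channels with a zero fourth slot.
        Vec4 out[4] = { {p0.x, p0.y, p0.z, 0}, {p0.w, p1.x, p1.y, 0},
                        {p1.z, p1.w, p2.x, 0}, {p2.y, p2.z, p2.w, 0} };

        for (int k = 0; k < 4; ++k)                 // scatter with row wrap-around
        {
            int p = 4 * id + k;
            if (p > pixel_end)                      // padded tail: nothing to store
                continue;
            dst[(p / cols) * dstStep_in_pixel + (p % cols)] = out[k];
        }
    }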
restrict src, __global GENTYPE4 *dst, int cols, int rows, + int srcStep_in_pixel,int pixel_end) { - int id = get_global_id(0); + int id = get_global_id(0)<<2; int y = id / cols; int x = id % cols; + int4 x4 = (int4)(x,x+1,x+2,x+3); + int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols); + x4 = select(x4,x4-(int4)cols,x4>=(int4)cols); + int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4); + GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2; + pixel0 = src[addr.x]; + pixel1 = src[addr.y]; + pixel2 = src[addr.z]; + pixel3 = src[addr.w]; - short4 data = src[y * srcStep + x]; - - if(y < rows) + pixel0.w = pixel1.x; + outpixel1.x = pixel1.y; + outpixel1.y = pixel1.z; + outpixel1.z = pixel2.x; + outpixel1.w = pixel2.y; + outpixel2.x = pixel2.z; + outpixel2.y = pixel3.x; + outpixel2.z = pixel3.y; + outpixel2.w = pixel3.z; + int4 outaddr = mul24(id>>2 , 3); + outaddr.y++; + outaddr.z+=2; + //printf("%d ",outaddr.z); + if(outaddr.z <= pixel_end) { - int d = y * dstStep + x * 3; - dst[d] = data.x; - dst[d + 1] = data.y; - dst[d + 2] = data.z; + dst[outaddr.x] = pixel0; + dst[outaddr.y] = outpixel1; + dst[outaddr.z] = outpixel2; } -} - -__kernel void convertC4C3_D0(__global const char4 * restrict src, __global char *dst, int cols, int rows, - int srcStep, int dstStep) -{ - int id = get_global_id(0); - int y = id / cols; - int x = id % cols; - - char4 data = src[y * srcStep + x]; - - if(y < rows) + else if(outaddr.y <= pixel_end) { - int d = y * dstStep + x * 3; - dst[d] = data.x; - dst[d + 1] = data.y; - dst[d + 2] = data.z; + dst[outaddr.x] = pixel0; + dst[outaddr.y] = outpixel1; } + else if(outaddr.x <= pixel_end) + { + dst[outaddr.x] = pixel0; + } } diff --git a/modules/ocl/src/kernels/imgproc_resize.cl b/modules/ocl/src/kernels/imgproc_resize.cl index 2841886e24..995ce967d7 100644 --- a/modules/ocl/src/kernels/imgproc_resize.cl +++ b/modules/ocl/src/kernels/imgproc_resize.cl @@ -16,7 +16,7 @@ // // @Authors // Zhang Ying, zhangying913@gmail.com -// +// Niko Li, newlife20080214@gmail.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -50,21 +50,11 @@ #if defined DOUBLE_SUPPORT #pragma OPENCL EXTENSION cl_khr_fp64:enable -typedef double F ; +#define F double #else -typedef float F; +#define F float #endif -inline uint4 getPoint_8uc4(__global uchar4 * data, int offset, int x, int y, int step) -{ - return convert_uint4(data[(offset>>2)+ y * (step>>2) + x]); -} - -inline float getPoint_32fc1(__global float * data, int offset, int x, int y, int step) -{ - return data[(offset>>2)+ y * (step>>2) + x]; -} - #define INTER_RESIZE_COEF_BITS 11 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) @@ -72,8 +62,8 @@ inline float getPoint_32fc1(__global float * data, int offset, int x, int y, int #define CAST_SCALE (1.0f/(1<= (l) ? 
(x):((x)+1)) -__kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned char const * restrict src, - int dst_offset, int src_offset,int dst_step, int src_step, +__kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int gx = get_global_id(0); @@ -81,7 +71,7 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha float4 sx, u, xf; int4 x, DX; - gx = (gx<<2) - (dst_offset&3); + gx = (gx<<2) - (dstoffset_in_pixel&3); DX = (int4)(gx, gx+1, gx+2, gx+3); sx = (convert_float4(DX) + 0.5f) * ifx - 0.5f; xf = floor(sx); @@ -119,10 +109,10 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha int4 val1, val2, val; int4 sdata1, sdata2, sdata3, sdata4; - int4 pos1 = src_offset + y * src_step + x; - int4 pos2 = src_offset + y * src_step + x_; - int4 pos3 = src_offset + y_ * src_step + x; - int4 pos4 = src_offset + y_ * src_step + x_; + int4 pos1 = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + int4 pos2 = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + int4 pos3 = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + int4 pos4 = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); sdata1.s0 = src[pos1.s0]; sdata1.s1 = src[pos1.s1]; @@ -144,20 +134,44 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha sdata4.s2 = src[pos4.s2]; sdata4.s3 = src[pos4.s3]; - val1 = U1 * sdata1 + U * sdata2; - val2 = U1 * sdata3 + U * sdata4; - val = V1 * val1 + V * val2; + val1 = mul24(U1 , sdata1) + mul24(U , sdata2); + val2 = mul24(U1 , sdata3) + mul24(U , sdata4); + val = mul24(V1 , val1) + mul24(V , val2); - __global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx); - uchar4 dVal = *d; - int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows); + //__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx); + //uchar4 dVal = *d; + //int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows); val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS); - *d = convert_uchar4(con != 0) ? convert_uchar4_sat(val) : dVal; - + //*d = convert_uchar4(con != 0) ? 
convert_uchar4_sat(val) : dVal; + + pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel); + pos4.y++; + pos4.z+=2; + uchar4 uval = convert_uchar4_sat(val); + int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows); + if(con) + { + *(__global uchar4*)(dst + pos4.x)=uval; + } + else + { + if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos4.x]=uval.x; + } + if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos4.y]=uval.y; + } + if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos4.z]=uval.z; + } + } } __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int dx = get_global_id(0); @@ -182,18 +196,25 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src, int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); - - uint4 val = U1* V1 * getPoint_8uc4(src,src_offset,x,y,src_step) + - U1* V * getPoint_8uc4(src,src_offset,x,y_,src_step) + - U * V1 * getPoint_8uc4(src,src_offset,x_,y,src_step) + - U * V * getPoint_8uc4(src,src_offset,x_,y_,src_step); - + int4 srcpos; + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); + int4 data0 = convert_int4(src[srcpos.x]); + int4 data1 = convert_int4(src[srcpos.y]); + int4 data2 = convert_int4(src[srcpos.z]); + int4 data3 = convert_int4(src[srcpos.w]); + int4 val = mul24(mul24(U1, V1) , data0) + mul24(mul24(U, V1) , data1) + +mul24(mul24(U1, V) , data2)+mul24(mul24(U, V) , data3); + int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel); + uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS); if(dx>=0 && dx=0 && dy>2) + dy * (dst_step>>2) + dx] = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS); + dst[dstpos] = uval; } __kernel void resizeLN_C1_D5(__global float * dst, __global float * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int dx = get_global_id(0); @@ -210,19 +231,29 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src, int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); - - float val1 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y,src_step) + - u * getPoint_32fc1(src,src_offset,x_,y,src_step) ; - float val2 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y_,src_step) + - u * getPoint_32fc1(src,src_offset,x_,y_,src_step); - float val = (1.0f-v) * val1 + v * val2; - + float u1 = 1.f-u; + float v1 = 1.f-v; + int4 srcpos; + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); + float data0 = src[srcpos.x]; + float data1 = src[srcpos.y]; + float data2 = src[srcpos.z]; + float data3 = src[srcpos.w]; + float val1 = u1 * data0 + + u * data1 ; + float val2 = u1 * data2 + + u * data3; + float val = v1 * val1 + v * val2; + int dstpos = mad24(dy, dststep_in_pixel, 
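Rather than the earlier read-modify-write of a whole uchar4 behind a comparison mask, resizeLN_C1_D0 above writes its packed result with one vector store when all four columns fit inside the image and falls back to per-component stores at the border. One thing worth double-checking in the patch: the border branch never stores uval.w, so when gx is pulled negative by the destination-offset alignment the pixel at gx+3 may be left unwritten. The control flow reduced to plain C++ (names mirror the kernel's):

    // Store a packed 4-pixel result with border handling, as in resizeLN_C1_D0.
    struct UChar4 { unsigned char x, y, z, w; };

    static void storeGuarded(unsigned char *dst, int rowStart /* index of column gx */,
                             UChar4 uval, int gx, int dy, int dst_cols, int dst_rows)
    {
        if (dy < 0 || dy >= dst_rows) return;
        if (gx >= 0 && gx + 3 < dst_cols)          // fast path: whole vector inside
        {
            dst[rowStart + 0] = uval.x; dst[rowStart + 1] = uval.y;  // one uchar4 store in OpenCL
            dst[rowStart + 2] = uval.z; dst[rowStart + 3] = uval.w;
        }
        else                                       // slow path: clip component by component
        {
            if (gx     >= 0 && gx     < dst_cols) dst[rowStart + 0] = uval.x;
            if (gx + 1 >= 0 && gx + 1 < dst_cols) dst[rowStart + 1] = uval.y;
            if (gx + 2 >= 0 && gx + 2 < dst_cols) dst[rowStart + 2] = uval.z;
            // uval.w is dropped here, matching the kernel's border branch.
        }
    }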
dx+dstoffset_in_pixel); if(dx>=0 && dx=0 && dy>2) + dy * (dst_step>>2) + dx] = val; + dst[dstpos] = val; } __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int dx = get_global_id(0); @@ -239,31 +270,35 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src, int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); - + float u1 = 1.f-u; + float v1 = 1.f-v; + int4 srcpos; + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); float4 s_data1, s_data2, s_data3, s_data4; - src_offset = (src_offset >> 4); - src_step = (src_step >> 4); - s_data1 = src[src_offset + y*src_step + x]; - s_data2 = src[src_offset + y*src_step + x_]; - s_data3 = src[src_offset + y_*src_step + x]; - s_data4 = src[src_offset + y_*src_step + x_]; - s_data1 = (1.0f-u) * s_data1 + u * s_data2; - s_data2 = (1.0f-u) * s_data3 + u * s_data4; - s_data3 = (1.0f-v) * s_data1 + v * s_data2; + s_data1 = src[srcpos.x]; + s_data2 = src[srcpos.y]; + s_data3 = src[srcpos.z]; + s_data4 = src[srcpos.w]; + float4 val = u1 * v1 * s_data1 + u * v1 * s_data2 + +u1 * v *s_data3 + u * v *s_data4; + int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel); if(dx>=0 && dx=0 && dy>4) + dy * (dst_step>>4) + dx] = s_data3; + dst[dstpos] = val; } __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) { int gx = get_global_id(0); int dy = get_global_id(1); - gx = (gx<<2) - (dst_offset&3); - int4 GX = (int4)(gx, gx+1, gx+2, gx+3); + gx = (gx<<2) - (dstoffset_in_pixel&3); + //int4 GX = (int4)(gx, gx+1, gx+2, gx+3); int4 sx; int sy; @@ -279,22 +314,42 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src, sy = min((int)floor(s5), src_rows-1); uchar4 val; - int4 pos = src_offset + sy * src_step + sx; + int4 pos = mad24(sy, srcstep_in_pixel, sx+srcoffset_in_pixel); val.s0 = src[pos.s0]; val.s1 = src[pos.s1]; val.s2 = src[pos.s2]; val.s3 = src[pos.s3]; - __global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx); - uchar4 dVal = *d; - int4 con = (GX >= 0 && GX < dst_cols && dy >= 0 && dy < dst_rows); - val = convert_uchar4(con != 0) ? 
val : dVal; - - *d = val; + //__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx); + //uchar4 dVal = *d; + pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel); + pos.y++; + pos.z+=2; + + int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows); + if(con) + { + *(__global uchar4*)(dst + pos.x)=val; + } + else + { + if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos.x]=val.x; + } + if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos.y]=val.y; + } + if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos.z]=val.z; + } + } } __kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) { int dx = get_global_id(0); @@ -304,8 +359,8 @@ __kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src, F s2 = dy*ify; int sx = fmin((float)floor(s1), (float)src_cols-1); int sy = fmin((float)floor(s2), (float)src_rows-1); - int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx; - int spos = (src_offset>>2) + sy * (src_step>>2) + sx; + int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel); + int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel); if(dx>=0 && dx=0 && dy>2) + dy * (dst_step>>2) + dx; - int spos = (src_offset>>2) + sy * (src_step>>2) + sx; - + + int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel); + int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel); if(dx>=0 && dx=0 && dy>4) + dy * (dst_step>>4) + dx; - int spos = (src_offset>>4) + sy * (src_step>>4) + sx; + int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel); + int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel); if(dx>=0 && dx=0 && dy=addr_start)&(idx+3 < addr_end) & (y < rows)) { *(__global uchar4*)(dstMat+idx) = out; @@ -65,7 +61,7 @@ __kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat, } } -__kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat, +__kernel void set_to_without_mask(GENTYPE scalar,__global GENTYPE * dstMat, int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) { int x=get_global_id(0); @@ -73,52 +69,6 @@ __kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat, if ( (x < cols) & (y < rows)) { int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = convert_uchar4_sat(scalar); + dstMat[idx] = scalar; } } -__kernel void set_to_without_mask_C1_D4(float4 scalar,__global int * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - int idx = mad24(y, dstStep_in_pixel, x+offset_in_pixel); - dstMat[idx] = convert_int_sat(scalar.x); - } -} -__kernel void set_to_without_mask_C4_D4(float4 scalar,__global int4 * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = convert_int4_sat(scalar); - } -} - -__kernel void set_to_without_mask_C1_D5(float4 scalar,__global float * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - 
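The operator_setTo.cl hunks above collapse the per-type set_to_without_mask_* kernels into a single GENTYPE version, with the concrete element type supplied at build time through a -D define; the matrix_operations.cpp code further below builds the same kind of option string (-D GENTYPE4=uchar4, char4, ... double4) from the matrix depth before calling openCLExecuteKernel. A small sketch of such an option builder (gentype4Option is a hypothetical helper; depth codes 0..6 correspond to CV_8U..CV_64F):

    #include <stdexcept>
    #include <string>

    // Map an OpenCV depth constant to the value passed as -D GENTYPE4=...
    // when building the type-generic convertC3C4/convertC4C3 kernels.
    static std::string gentype4Option(int depth)
    {
        static const char *types[] = { "uchar4", "char4", "ushort4", "short4",
                                       "int4", "float4", "double4" };
        if (depth < 0 || depth > 6)
            throw std::runtime_error("unknown depth");
        return std::string("-D GENTYPE4=") + types[depth];
    }
    // e.g. pass gentype4Option(src.depth()).c_str() as the compile option string.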
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = scalar.x; - } -} -__kernel void set_to_without_mask_C4_D5(float4 scalar,__global float4 * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = scalar; - } -} - diff --git a/modules/ocl/src/kernels/operator_setToM.cl b/modules/ocl/src/kernels/operator_setToM.cl index e306657e4b..56a579b3f3 100644 --- a/modules/ocl/src/kernels/operator_setToM.cl +++ b/modules/ocl/src/kernels/operator_setToM.cl @@ -35,12 +35,6 @@ // -/*#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (__NVIDIA__) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif -*/ /* __kernel void set_to_with_mask_C1_D0( float4 scalar, @@ -67,7 +61,7 @@ __kernel void set_to_with_mask_C1_D0( */ //#pragma OPENCL EXTENSION cl_amd_printf : enable __kernel void set_to_with_mask_C1_D0( - float4 scalar, + uchar scalar, __global uchar* dstMat, int cols, int rows, @@ -85,7 +79,7 @@ __kernel void set_to_with_mask_C1_D0( int mask_addr_start = mad24(y,maskStep,maskoffset); int mask_addr_end = mad24(y,maskStep,cols+maskoffset); int maskidx = mad24(y,maskStep,x+ maskoffset & (int)0xfffffffc); - uchar out = convert_uchar_sat(scalar.x); + int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3; if ( (x < cols) & (y < rows) ) @@ -107,104 +101,16 @@ __kernel void set_to_with_mask_C1_D0( temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0; temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0; uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z}; - temp_dst.x = (dstidx>=dst_addr_start)&(dstidx=dst_addr_start)&(dstidx+1=dst_addr_start)&(dstidx+2=dst_addr_start)&(dstidx+3=dst_addr_start)&(dstidx=dst_addr_start)&(dstidx+1=dst_addr_start)&(dstidx+2=dst_addr_start)&(dstidx+3 > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholecols)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel)); + args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); - size_t globalThreads[3] = {(dst.wholecols *dst.wholerows + 255) / 256 * 256, 1, 1}; + size_t globalThreads[3] = {((dst.wholecols *dst.wholerows+3)/4 + 255) / 256 * 256, 1, 1}; size_t localThreads[3] = {256, 1, 1}; - openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, dst.elemSize1() >> 1); + openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1,compile_option); } //////////////////////////////////////////////////////////////////////// // convert_C4C3 void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep) { - int srcStep = src.step1() / src.channels(); + int srcStep_in_pixel = src.step1() / src.channels(); + int pixel_end = src.wholecols*src.wholerows -1; Context *clCxt = src.clCxt; string kernelName = "convertC4C3"; + char compile_option[32]; + switch(src.depth()) + { + case 0: + sprintf(compile_option, "-D 
GENTYPE4=uchar4"); + break; + case 1: + sprintf(compile_option, "-D GENTYPE4=char4"); + break; + case 2: + sprintf(compile_option, "-D GENTYPE4=ushort4"); + break; + case 3: + sprintf(compile_option, "-D GENTYPE4=short4"); + break; + case 4: + sprintf(compile_option, "-D GENTYPE4=int4"); + break; + case 5: + sprintf(compile_option, "-D GENTYPE4=float4"); + break; + case 6: + sprintf(compile_option, "-D GENTYPE4=double4"); + break; + default: + CV_Error(-217,"unknown depth"); + } vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel)); + args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); - size_t globalThreads[3] = {(src.wholecols *src.wholerows + 255) / 256 * 256, 1, 1}; + size_t globalThreads[3] = {((src.wholecols *src.wholerows+3)/4 + 255) / 256 * 256, 1, 1}; size_t localThreads[3] = {256, 1, 1}; - openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, src.elemSize1() >> 1); + openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1,compile_option); } void cv::ocl::oclMat::upload(const Mat &m) @@ -173,23 +229,47 @@ void cv::ocl::oclMat::upload(const Mat &m) Point ofs; m.locateROI(wholeSize, ofs); int type = m.type(); - //if(m.channels() == 3) - //type = CV_MAKETYPE(m.depth(), 4); + if(m.channels() == 3) + { + type = CV_MAKETYPE(m.depth(), 4); + } create(wholeSize, type); - //if(m.channels() == 3) - //{ - //int pitch = GPU_MATRIX_MALLOC_STEP(wholeSize.width * 3 * m.elemSize1()); - //int err; - //cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE, - //pitch*wholeSize.height,0,&err); - //CV_DbgAssert(err==0); - - //openCLMemcpy2D(clCxt,temp,pitch,m.datastart,m.step,wholeSize.width*m.elemSize(),wholeSize.height,clMemcpyHostToDevice); - //convert_C3C4(temp, *this, pitch); - //} - //else - openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice); + if(m.channels() == 3) + { + int pitch = wholeSize.width * 3 * m.elemSize1(); + int tail_padding = m.elemSize1()*3072; + int err; + cl_mem temp = clCreateBuffer(clCxt->impl->clContext,CL_MEM_READ_WRITE, + (pitch*wholeSize.height+tail_padding-1)/tail_padding*tail_padding,0,&err); + openCLVerifyCall(err); + + openCLMemcpy2D(clCxt,temp,pitch,m.datastart,m.step,wholeSize.width*m.elemSize(),wholeSize.height,clMemcpyHostToDevice,3); + convert_C3C4(temp, *this, pitch); + //int* cputemp=new int[wholeSize.height*wholeSize.width * 3]; + //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, + // 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL)); + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE, + // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL)); + //for(int i=0;istep/sizeof(int); + // for(int j=0;jempty()); int t = type(); - //if(download_channels == 3) - //t = CV_MAKETYPE(depth(), 3); + if(download_channels == 3) + { + t = CV_MAKETYPE(depth(), 3); + } m.create(wholerows, wholecols, t); - //if(download_channels 
== 3) - //{ - //int pitch = GPU_MATRIX_MALLOC_STEP(wholecols * 3 * m.elemSize1()); - //int err; - //cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE, - //pitch*wholerows,0,&err); - //CV_DbgAssert(err==0); - - //convert_C4C3(*this, temp, pitch/m.elemSize1()); - //openCLMemcpy2D(clCxt,m.data,m.step,temp,pitch,wholecols*m.elemSize(),wholerows,clMemcpyDeviceToHost); - //} - //else - openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost); + if(download_channels == 3) + { + int pitch = wholecols * 3 * m.elemSize1(); + int tail_padding = m.elemSize1()*3072; + int err; + cl_mem temp = clCreateBuffer(clCxt->impl->clContext,CL_MEM_READ_WRITE, + (pitch*wholerows+tail_padding-1)/tail_padding*tail_padding,0,&err); + openCLVerifyCall(err); + + convert_C4C3(*this, temp, pitch/m.elemSize1()); + openCLMemcpy2D(clCxt,m.data,m.step,temp,pitch,wholecols*m.elemSize(),wholerows,clMemcpyDeviceToHost,3); + //int* cputemp=new int[wholecols*wholerows * 3]; + //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, + // 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL)); + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE, + // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL)); + //for(int i=0;istep/sizeof(int); + // for(int j=0;j > args; - cl_float4 val; - val.s[0] = scalar.val[0]; - val.s[1] = scalar.val[1]; - val.s[2] = scalar.val[2]; - val.s[3] = scalar.val[3]; + size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3]; globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; @@ -388,25 +488,168 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern { globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; } - args.push_back( make_pair( sizeof(cl_float4) , (void *)&val )); + char compile_option[32]; + union sc + { + cl_uchar4 uval; + cl_char4 cval; + cl_ushort4 usval; + cl_short4 shval; + cl_int4 ival; + cl_float4 fval; + cl_double4 dval; + }val; + switch(dst.depth()) + { + case 0: + val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]); + val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]); + val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]); + val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=uchar"); + args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=uchar4"); + args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 1: + val.cval.s[0] = saturate_cast<char>(scalar.val[0]); + val.cval.s[1] = saturate_cast<char>(scalar.val[1]); + val.cval.s[2] = saturate_cast<char>(scalar.val[2]); + val.cval.s[3] = saturate_cast<char>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=char"); + args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=char4"); + args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 2: + val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]); + val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]); + val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]); + val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]); +
switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=ushort"); + args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=ushort4"); + args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 3: + val.shval.s[0] = saturate_cast<short>(scalar.val[0]); + val.shval.s[1] = saturate_cast<short>(scalar.val[1]); + val.shval.s[2] = saturate_cast<short>(scalar.val[2]); + val.shval.s[3] = saturate_cast<short>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=short"); + args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=short4"); + args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 4: + val.ival.s[0] = saturate_cast<int>(scalar.val[0]); + val.ival.s[1] = saturate_cast<int>(scalar.val[1]); + val.ival.s[2] = saturate_cast<int>(scalar.val[2]); + val.ival.s[3] = saturate_cast<int>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=int"); + args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=int4"); + args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 5: + val.fval.s[0] = scalar.val[0]; + val.fval.s[1] = scalar.val[1]; + val.fval.s[2] = scalar.val[2]; + val.fval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=float"); + args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=float4"); + args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 6: + val.dval.s[0] = scalar.val[0]; + val.dval.s[1] = scalar.val[1]; + val.dval.s[2] = scalar.val[2]; + val.dval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=double"); + args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=double4"); + args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + default: + CV_Error(-217,"unknown depth"); + } args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, dst.channels(), dst.depth()); + localThreads, args, -1, -1,compile_option); } void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, string kernelName) { CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols); vector<pair<size_t , const void *> > args; - cl_float4 val; - val.s[0] = scalar.val[0]; - val.s[1] = scalar.val[1]; - val.s[2] = scalar.val[2]; - val.s[3] = scalar.val[3]; size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3]; globalThreads[0] = (dst.cols + localThreads[0] - 1)
/ localThreads[0] * localThreads[0]; @@ -417,7 +660,155 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat & globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; } int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize(); - args.push_back( make_pair( sizeof(cl_float4) , (void *)&val )); + char compile_option[32]; + union sc + { + cl_uchar4 uval; + cl_char4 cval; + cl_ushort4 usval; + cl_short4 shval; + cl_int4 ival; + cl_float4 fval; + cl_double4 dval; + }val; + switch(dst.depth()) + { + case 0: + val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]); + val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]); + val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]); + val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=uchar"); + args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=uchar4"); + args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 1: + val.cval.s[0] = saturate_cast<char>(scalar.val[0]); + val.cval.s[1] = saturate_cast<char>(scalar.val[1]); + val.cval.s[2] = saturate_cast<char>(scalar.val[2]); + val.cval.s[3] = saturate_cast<char>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=char"); + args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=char4"); + args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 2: + val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]); + val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]); + val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]); + val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=ushort"); + args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=ushort4"); + args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 3: + val.shval.s[0] = saturate_cast<short>(scalar.val[0]); + val.shval.s[1] = saturate_cast<short>(scalar.val[1]); + val.shval.s[2] = saturate_cast<short>(scalar.val[2]); + val.shval.s[3] = saturate_cast<short>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=short"); + args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=short4"); + args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 4: + val.ival.s[0] = saturate_cast<int>(scalar.val[0]); + val.ival.s[1] = saturate_cast<int>(scalar.val[1]); + val.ival.s[2] = saturate_cast<int>(scalar.val[2]); + val.ival.s[3] = saturate_cast<int>(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=int"); + args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=int4"); + args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 5: + val.fval.s[0] = scalar.val[0]; + val.fval.s[1] =
scalar.val[1]; + val.fval.s[2] = scalar.val[2]; + val.fval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=float"); + args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=float4"); + args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 6: + val.dval.s[0] = scalar.val[0]; + val.dval.s[1] = scalar.val[1]; + val.dval.s[2] = scalar.val[2]; + val.dval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=double"); + args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=double4"); + args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + default: + CV_Error(-217,"unknown depth"); + } args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); @@ -427,7 +818,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat & args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset )); openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads, - localThreads, args, dst.channels(), dst.depth()); + localThreads, args, -1, -1,compile_option); } oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask) @@ -446,11 +837,25 @@ oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask) // (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0)); if (mask.empty()) { - set_to_withoutmask_run(*this, scalar, "set_to_without_mask"); + if(type()==CV_8UC1) + { + set_to_withoutmask_run(*this, scalar, "set_to_without_mask_C1_D0"); + } + else + { + set_to_withoutmask_run(*this, scalar, "set_to_without_mask"); + } } else { - set_to_withmask_run(*this, scalar, mask, "set_to_with_mask"); + if(type()==CV_8UC1) + { + set_to_withmask_run(*this, scalar, mask,"set_to_with_mask_C1_D0"); + } + else + { + set_to_withmask_run(*this, scalar, mask, "set_to_with_mask"); + } } return *this; diff --git a/modules/ocl/src/precomp.hpp b/modules/ocl/src/precomp.hpp index 587d70dce7..0bde1e7754 100644 --- a/modules/ocl/src/precomp.hpp +++ b/modules/ocl/src/precomp.hpp @@ -97,7 +97,7 @@ namespace cv size_t widthInBytes, size_t height); void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, const void *src, size_t spitch, - size_t width, size_t height, enum openCLMemcpyKind kind); + size_t width, size_t height, enum openCLMemcpyKind kind, int channels=-1); void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, const void *src, size_t spitch, size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind); @@ -126,8 +126,8 @@ namespace cv cl_mem openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr); - void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr, - enum openCLMemcpyKind kind, cl_bool blocking_write); + //void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr, + // enum openCLMemcpyKind kind, cl_bool blocking_write); int savetofile(const Context 
*clcxt, cl_program &program, const char *fileName); struct Context::Impl { diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index ff2f441718..90ff0b4414 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -958,7 +958,7 @@ TEST_P(Remap, Mat) if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1)) { cout << "LINEAR don't support the map1Type and map2Type" << endl; - return; + return; } int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; diff --git a/modules/ocl/test/test_matrix_operation.cpp b/modules/ocl/test/test_matrix_operation.cpp index 997fbe7ad5..d538748fb7 100644 --- a/modules/ocl/test/test_matrix_operation.cpp +++ b/modules/ocl/test/test_matrix_operation.cpp @@ -396,6 +396,101 @@ TEST_P(SetTo, With_mask) } } +//convertC3C4 +PARAM_TEST_CASE(convertC3C4, MatType, cv::Size) +{ + int type; + cv::Size ksize; + + //src mat + cv::Mat mat1; + cv::Mat dst; + + // set up roi + int roicols; + int roirows; + int src1x; + int src1y; + int dstx; + int dsty; + + //src mat with roi + cv::Mat mat1_roi; + cv::Mat dst_roi; + std::vector<cv::ocl::Info> oclinfo; + //ocl dst mat for testing + cv::ocl::oclMat gdst_whole; + + //ocl mat with roi + cv::ocl::oclMat gmat1; + cv::ocl::oclMat gdst; + + virtual void SetUp() + { + type = GET_PARAM(0); + ksize = GET_PARAM(1); + + + + //dst = randomMat(rng, size, type, 5, 16, false); + int devnums = getDevice(oclinfo); + CV_Assert(devnums > 0); + //if you want to use undefault device, set it here + //setDevice(oclinfo[1]); + } + + void random_roi() + { +#ifdef RANDOMROI + //randomize ROI + cv::RNG &rng = TS::ptr()->get_rng(); + roicols = rng.uniform(2, mat1.cols); + roirows = rng.uniform(2, mat1.rows); + src1x = rng.uniform(0, mat1.cols - roicols); + src1y = rng.uniform(0, mat1.rows - roirows); + dstx = rng.uniform(0, dst.cols - roicols); + dsty = rng.uniform(0, dst.rows - roirows); +#else + roicols = mat1.cols; + roirows = mat1.rows; + src1x = 0; + src1y = 0; + dstx = 0; + dsty = 0; +#endif + + mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); + dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); + + gdst_whole = dst; + gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); + + + gmat1 = mat1_roi; + } + +}; + +TEST_P(convertC3C4, Accuracy) +{ + cv::RNG &rng = TS::ptr()->get_rng(); + for(int j = 0; j < LOOP_TIMES; j++) + { + //random_roi(); + int width = rng.uniform(2, MWIDTH); + int height = rng.uniform(2, MHEIGHT); + cv::Size size(width, height); + + mat1 = randomMat(rng, size, type, 0, 40, false); + gmat1 = mat1; + cv::Mat cpu_dst; + gmat1.download(cpu_dst); + char sss[1024]; + sprintf(sss, "cols=%d,rows=%d", mat1.cols, mat1.rows); + EXPECT_MAT_NEAR(mat1, cpu_dst, 0.0, sss); + } + +} INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine( Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), @@ -408,5 +503,8 @@ INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine( INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine( Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), Values(false))); // Values(false) is the reserved parameter - + +INSTANTIATE_TEST_CASE_P(MatrixOperation, convertC3C4, Combine( + Values(CV_8UC3, CV_32SC3, CV_32FC3), + Values(cv::Size()))); #endif
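Appendix (not part of the patch): two short sketches distill the host-side patterns the changes above introduce. The first isolates the staging-buffer sizing used by the new 3-channel upload()/download() path, where a tightly packed 3-channel buffer is allocated with its size rounded up to a multiple of elemSize1()*3072 bytes before convert_C3C4/convert_C4C3 repack it against the 4-channel oclMat. The helper name paddedC3BufferSize is a hypothetical label for arithmetic the patch writes inline.

    // Sketch only: mirrors the clCreateBuffer size used for the C3<->C4 staging buffer.
    #include <cstddef>
    #include <iostream>

    // (pitch * height) rounded up to a multiple of elemSize1 * 3072, as in upload()/download().
    static size_t paddedC3BufferSize(size_t width, size_t height, size_t elemSize1)
    {
        size_t pitch = width * 3 * elemSize1;      // packed 3-channel row, no alignment
        size_t tail_padding = elemSize1 * 3072;    // padding granularity taken from the patch
        return (pitch * height + tail_padding - 1) / tail_padding * tail_padding;
    }

    int main()
    {
        // e.g. a 640x480 CV_8UC3 image: 921600 bytes, already a multiple of 3072
        std::cout << paddedC3BufferSize(640, 480, 1) << "\n";
        return 0;
    }

Since the convertC3C4/convertC4C3 kernels are launched over (wholecols*wholerows+3)/4 work items with a pixel_end guard, the tail padding presumably leaves room for the last 4-pixel vector access to stay inside the allocation.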
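The second sketch covers the setTo changes: instead of one kernel per type, a single generic kernel is specialized at build time through a "-D GENTYPE=..." option while the host packs the scalar into the matching cl_* value with saturate_cast. The code below shows only the depth/channels-to-option mapping; gentypeOption is an assumed helper name, and the depth codes follow the 0..6 ordering of the switch statements in the patch (CV_8U..CV_64F).

    // Sketch only: the depth/channels -> "-D GENTYPE=..." mapping used by
    // set_to_withoutmask_run / set_to_withmask_run and operator_setTo.cl.
    #include <iostream>
    #include <stdexcept>
    #include <string>

    static std::string gentypeOption(int depth, int channels)
    {
        static const char *base[] = { "uchar", "char", "ushort", "short",
                                      "int", "float", "double" };
        if (depth < 0 || depth > 6)
            throw std::runtime_error("unknown depth");
        if (channels != 1 && channels != 4)
            throw std::runtime_error("unsupported channels");
        std::string t = base[depth];
        if (channels == 4)
            t += "4";                 // e.g. "uchar4", the GENTYPE seen by the kernel
        return "-D GENTYPE=" + t;     // passed to openCLExecuteKernel as compile_option
    }

    int main()
    {
        std::cout << gentypeOption(0, 4) << "\n"; // CV_8UC4  -> -D GENTYPE=uchar4
        std::cout << gentypeOption(5, 1) << "\n"; // CV_32FC1 -> -D GENTYPE=float
        return 0;
    }

One wrinkle the dispatch keeps: setTo() still routes CV_8UC1 matrices to the hand-vectorized set_to_without_mask_C1_D0 / set_to_with_mask_C1_D0 kernels, so the generic GENTYPE path only serves the remaining type combinations.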