diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index ed96eda4c8..88f603baaf 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -103,7 +103,11 @@ CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName); CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options); +CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, + string kernelName, int channels, int depth, const char *build_options); CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); +CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args); CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair > &args, int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, diff --git a/modules/ocl/src/cl_operations.cpp b/modules/ocl/src/cl_operations.cpp index 7f09b1e505..d344689c4b 100644 --- a/modules/ocl/src/cl_operations.cpp +++ b/modules/ocl/src/cl_operations.cpp @@ -336,8 +336,7 @@ static std::string removeDuplicatedWhiteSpaces(const char * buildOptions) return opt; } -void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3], - size_t localThreads[3], vector< pair > &args, int channels, +cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, int channels, int depth, const char *build_options) { //construct kernel name @@ -350,10 +349,14 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str idxStr << "_D" << depth; kernelName += idxStr.str(); - cl_kernel kernel; std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options); - kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str()); + cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str()); + return kernel; +} +void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3], + size_t localThreads[3], vector< pair > &args) +{ if ( localThreads != NULL) { globalThreads[0] = roundUp(globalThreads[0], localThreads[0]); @@ -399,6 +402,15 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str openCLSafeCall(clReleaseKernel(kernel)); } +void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3], + size_t localThreads[3], vector< pair > &args, int channels, + int depth, const char *build_options) +{ + cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options); + + openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args); +} + void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3], size_t localThreads[3], vector< pair > &args, int channels, int depth) diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 4a04e2de83..1ba07114a3 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -578,104 +578,124 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel, kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice); } - size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; + size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; + do { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) + BLOCK_SIZE /= 2; #if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices - size_t BLOCK_SIZE_Y = 1; + size_t BLOCK_SIZE_Y = 1; #else - size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices - while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) - BLOCK_SIZE_Y *= 2; + size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices + while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) + BLOCK_SIZE_Y *= 2; #endif - CV_Assert((size_t)ksize.width <= BLOCK_SIZE); + CV_Assert((size_t)ksize.width <= BLOCK_SIZE); - bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; - vector > args; + vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - cl_uint stepBytes = src.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); - int offsetXBytes = src.offset % src.step; - int offsetX = offsetXBytes / src.elemSize(); - CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); - int offsetY = src.offset / src.step; - int endX = (offsetX + src.cols); - int endY = (offsetY + src.rows); - cl_int rect[4] = {offsetX, offsetY, endX, endY}; - if (!isIsolatedBorder) - { - rect[2] = src.wholecols; - rect[3] = src.wholerows; - } - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); - - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - cl_uint _stepBytes = dst.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); - int _offsetXBytes = dst.offset % dst.step; - int _offsetX = _offsetXBytes / dst.elemSize(); - CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); - int _offsetY = dst.offset / dst.step; - int _endX = (_offsetX + dst.cols); - int _endY = (_offsetY + dst.rows); - cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); - - float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body - double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body - if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) - { - if (useDouble) - args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); - else - args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); - } + args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); + cl_uint stepBytes = src.step; + args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); + int offsetXBytes = src.offset % src.step; + int offsetX = offsetXBytes / src.elemSize(); + CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); + int offsetY = src.offset / src.step; + int endX = (offsetX + src.cols); + int endY = (offsetY + src.rows); + cl_int rect[4] = {offsetX, offsetY, endX, endY}; + if (!isIsolatedBorder) + { + rect[2] = src.wholecols; + rect[3] = src.wholerows; + } + args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); + + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); + cl_uint _stepBytes = dst.step; + args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); + int _offsetXBytes = dst.offset % dst.step; + int _offsetX = _offsetXBytes / dst.elemSize(); + CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); + int _offsetY = dst.offset / dst.step; + int _endX = (_offsetX + dst.cols); + int _endY = (_offsetY + dst.rows); + cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; + args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); + + float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) + { + if (useDouble) + args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); + else + args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); + } - args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data)); - const char* btype = NULL; + const char* btype = NULL; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - btype = "BORDER_CONSTANT"; - break; - case BORDER_REPLICATE: - btype = "BORDER_REPLICATE"; - break; - case BORDER_REFLECT: - btype = "BORDER_REFLECT"; - break; - case BORDER_WRAP: - CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case BORDER_REFLECT101: - btype = "BORDER_REFLECT_101"; - break; - } + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + btype = "BORDER_CONSTANT"; + break; + case BORDER_REPLICATE: + btype = "BORDER_REPLICATE"; + break; + case BORDER_REFLECT: + btype = "BORDER_REFLECT"; + break; + case BORDER_WRAP: + CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); + return; + case BORDER_REFLECT101: + btype = "BORDER_REFLECT_101"; + break; + } + + int requiredTop = anchor.y; + int requiredLeft = BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isIsolatedBorder ? src.rows : src.wholerows; + int w = isIsolatedBorder ? src.cols : src.wholecols; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + char build_options[1024]; + sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d " + "-D %s -D %s -D %s", + (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, + src.depth(), src.oclchannels(), useDouble ? 1 : 0, + anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned, + btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + + size_t lt[3] = {BLOCK_SIZE, 1, 1}; + size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; + + cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options); + + size_t kernelWorkGroupSize; + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); + if (lt[0] > kernelWorkGroupSize) + { + clReleaseKernel(kernel); + CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); + tryWorkItems = kernelWorkGroupSize; + continue; + } - int requiredTop = anchor.y; - int requiredLeft = BLOCK_SIZE; // not this: anchor.x; - int requiredBottom = ksize.height - 1 - anchor.y; - int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; - int h = isIsolatedBorder ? src.rows : src.wholerows; - int w = isIsolatedBorder ? src.cols : src.wholecols; - bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; - - char build_options[1024]; - sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d " - "-D %s -D %s -D %s", - (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, - src.depth(), src.oclchannels(), useDouble ? 1 : 0, - anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned, - btype, - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); - - size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1}; - openCLExecuteKernel(src.clCxt, &filtering_filter2D, "filter2D", gt, lt, args, -1, -1, build_options); + openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here + } while (false); } Ptr cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize, @@ -770,106 +790,126 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst, (src.rows == dst.rows)); CV_Assert(src.oclchannels() == dst.oclchannels()); - size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; - size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices - while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) - BLOCK_SIZE_Y *= 2; - - CV_Assert((size_t)ksize.width <= BLOCK_SIZE); - - bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; - - vector > args; - - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - cl_uint stepBytes = src.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); - int offsetXBytes = src.offset % src.step; - int offsetX = offsetXBytes / src.elemSize(); - CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); - int offsetY = src.offset / src.step; - int endX = (offsetX + src.cols); - int endY = (offsetY + src.rows); - cl_int rect[4] = {offsetX, offsetY, endX, endY}; - if (!isIsolatedBorder) - { - rect[2] = src.wholecols; - rect[3] = src.wholerows; - } - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); - - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - cl_uint _stepBytes = dst.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); - int _offsetXBytes = dst.offset % dst.step; - int _offsetX = _offsetXBytes / dst.elemSize(); - CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); - int _offsetY = dst.offset / dst.step; - int _endX = (_offsetX + dst.cols); - int _endY = (_offsetY + dst.rows); - cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); - - bool useDouble = src.depth() == CV_64F; + size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; + do { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) + BLOCK_SIZE /= 2; + size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices + while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) + BLOCK_SIZE_Y *= 2; + + CV_Assert((size_t)ksize.width <= BLOCK_SIZE); + + bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + + vector > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); + cl_uint stepBytes = src.step; + args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); + int offsetXBytes = src.offset % src.step; + int offsetX = offsetXBytes / src.elemSize(); + CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); + int offsetY = src.offset / src.step; + int endX = (offsetX + src.cols); + int endY = (offsetY + src.rows); + cl_int rect[4] = {offsetX, offsetY, endX, endY}; + if (!isIsolatedBorder) + { + rect[2] = src.wholecols; + rect[3] = src.wholerows; + } + args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); + + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); + cl_uint _stepBytes = dst.step; + args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); + int _offsetXBytes = dst.offset % dst.step; + int _offsetX = _offsetXBytes / dst.elemSize(); + CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); + int _offsetY = dst.offset / dst.step; + int _endX = (_offsetX + dst.cols); + int _endY = (_offsetY + dst.rows); + cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; + args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); + + bool useDouble = src.depth() == CV_64F; + + float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) + { + if (useDouble) + args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); + else + args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); + } - float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body - double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body - if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) - { + double alphaDouble = alpha; // DON'T move into 'if' body if (useDouble) - args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); + args.push_back( make_pair( sizeof(double), (void *)&alphaDouble)); else - args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); - } + args.push_back( make_pair( sizeof(float), (void *)&alpha)); - double alphaDouble = alpha; // DON'T move into 'if' body - if (useDouble) - args.push_back( make_pair( sizeof(double), (void *)&alphaDouble)); - else - args.push_back( make_pair( sizeof(float), (void *)&alpha)); + const char* btype = NULL; - const char* btype = NULL; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + btype = "BORDER_CONSTANT"; + break; + case BORDER_REPLICATE: + btype = "BORDER_REPLICATE"; + break; + case BORDER_REFLECT: + btype = "BORDER_REFLECT"; + break; + case BORDER_WRAP: + CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); + return; + case BORDER_REFLECT101: + btype = "BORDER_REFLECT_101"; + break; + } - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - btype = "BORDER_CONSTANT"; - break; - case BORDER_REPLICATE: - btype = "BORDER_REPLICATE"; - break; - case BORDER_REFLECT: - btype = "BORDER_REFLECT"; - break; - case BORDER_WRAP: - CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case BORDER_REFLECT101: - btype = "BORDER_REFLECT_101"; - break; - } + int requiredTop = anchor.y; + int requiredLeft = BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isIsolatedBorder ? src.rows : src.wholerows; + int w = isIsolatedBorder ? src.cols : src.wholecols; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well + + char build_options[1024]; + sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s", + (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, + src.depth(), src.oclchannels(), useDouble ? 1 : 0, + anchor.x, anchor.y, ksize.width, ksize.height, + btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + + size_t lt[3] = {BLOCK_SIZE, 1, 1}; + size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; + + cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options); + + size_t kernelWorkGroupSize; + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); + if (lt[0] > kernelWorkGroupSize) + { + clReleaseKernel(kernel); + CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); + tryWorkItems = kernelWorkGroupSize; + continue; + } - int requiredTop = anchor.y; - int requiredLeft = BLOCK_SIZE; // not this: anchor.x; - int requiredBottom = ksize.height - 1 - anchor.y; - int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; - int h = isIsolatedBorder ? src.rows : src.wholerows; - int w = isIsolatedBorder ? src.cols : src.wholecols; - bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; - - CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well - - char build_options[1024]; - sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s", - (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, - src.depth(), src.oclchannels(), useDouble ? 1 : 0, - anchor.x, anchor.y, ksize.width, ksize.height, - btype, - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); - - size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1}; - openCLExecuteKernel(src.clCxt, &filtering_boxFilter, "boxFilter", gt, lt, args, -1, -1, build_options); + openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here + } while (false); } Ptr cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,