|
|
|
@ -578,7 +578,11 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel, |
|
|
|
|
kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; |
|
|
|
|
size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; |
|
|
|
|
do { |
|
|
|
|
size_t BLOCK_SIZE = tryWorkItems; |
|
|
|
|
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) |
|
|
|
|
BLOCK_SIZE /= 2; |
|
|
|
|
#if 1 // TODO: the mode with several blocks requires many more VGPRs, so this optimization is not applicable to current devices
|
|
|
|
|
size_t BLOCK_SIZE_Y = 1; |
|
|
|
|
#else |
|
|
|
@ -674,8 +678,24 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel, |
|
|
|
|
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", |
|
|
|
|
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); |
|
|
|
|
|
|
|
|
|
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1}; |
|
|
|
|
openCLExecuteKernel(src.clCxt, &filtering_filter2D, "filter2D", gt, lt, args, -1, -1, build_options); |
|
|
|
|
size_t lt[3] = {BLOCK_SIZE, 1, 1}; |
|
|
|
|
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; |
|
|
|
|
|
|
|
|
|
cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options); |
|
|
|
|
|
|
|
|
|
size_t kernelWorkGroupSize; |
|
|
|
|
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), |
|
|
|
|
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); |
|
|
|
|
if (lt[0] > kernelWorkGroupSize) |
|
|
|
|
{ |
|
|
|
|
clReleaseKernel(kernel); |
|
|
|
|
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); |
|
|
|
|
tryWorkItems = kernelWorkGroupSize; |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
|
|
|
|
|
} while (false); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize, |
|
|
|
@ -770,7 +790,11 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst, |
|
|
|
|
(src.rows == dst.rows)); |
|
|
|
|
CV_Assert(src.oclchannels() == dst.oclchannels()); |
|
|
|
|
|
|
|
|
|
size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; |
|
|
|
|
size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; |
|
|
|
|
do { |
|
|
|
|
size_t BLOCK_SIZE = tryWorkItems; |
|
|
|
|
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) |
|
|
|
|
BLOCK_SIZE /= 2; |
|
|
|
|
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
|
|
|
|
|
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) |
|
|
|
|
BLOCK_SIZE_Y *= 2; |
|
|
|
@ -868,8 +892,24 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst, |
|
|
|
|
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", |
|
|
|
|
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); |
|
|
|
|
|
|
|
|
|
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1}; |
|
|
|
|
openCLExecuteKernel(src.clCxt, &filtering_boxFilter, "boxFilter", gt, lt, args, -1, -1, build_options); |
|
|
|
|
size_t lt[3] = {BLOCK_SIZE, 1, 1}; |
|
|
|
|
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; |
|
|
|
|
|
|
|
|
|
cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options); |
|
|
|
|
|
|
|
|
|
size_t kernelWorkGroupSize; |
|
|
|
|
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), |
|
|
|
|
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); |
|
|
|
|
if (lt[0] > kernelWorkGroupSize) |
|
|
|
|
{ |
|
|
|
|
clReleaseKernel(kernel); |
|
|
|
|
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); |
|
|
|
|
tryWorkItems = kernelWorkGroupSize; |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
|
|
|
|
|
} while (false); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/, |
|
|
|
|