From 198cd1a40db82c59fca6b41c263efd89a5aeb5ca Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 18 Nov 2013 18:46:59 +0400 Subject: [PATCH] added area fast mode to ocl::resize --- modules/ocl/perf/perf_imgwarp.cpp | 2 +- modules/ocl/src/imgproc.cpp | 100 +++++++++++++++++------ modules/ocl/src/opencl/imgproc_resize.cl | 41 +++++++++- modules/ocl/test/utility.cpp | 10 ++- modules/ocl/test/utility.hpp | 2 +- 5 files changed, 124 insertions(+), 31 deletions(-) diff --git a/modules/ocl/perf/perf_imgwarp.cpp b/modules/ocl/perf/perf_imgwarp.cpp index 62f3345493..e768d66219 100644 --- a/modules/ocl/perf/perf_imgwarp.cpp +++ b/modules/ocl/perf/perf_imgwarp.cpp @@ -191,7 +191,7 @@ typedef TestBaseWithParam resizeAreaFixture; PERF_TEST_P(resizeAreaFixture, resize, ::testing::Combine(OCL_TYPICAL_MAT_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), - ::testing::Values(0.6, 0.3))) + ::testing::Values(0.3, 0.5, 0.6))) { const resizeAreaParams params = GetParam(); const Size srcSize = get<0>(params); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index d06adb0985..e8f42edea6 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -318,28 +318,46 @@ namespace cv ofs_tab[dx] = k; } + static void computeResizeAreaFastTabs(int * dmap_tab, int * smap_tab, int scale, int dcols, int scol) + { + for (int i = 0; i < dcols; ++i) + dmap_tab[i] = scale * i; + + for (int i = 0, size = dcols * scale; i < size; ++i) + smap_tab[i] = std::min(scol - 1, i); + } + static void resize_gpu( const oclMat &src, oclMat &dst, double ifx, double ify, int interpolation) { float ifxf = (float)ifx, ifyf = (float)ify; int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize(); int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize(); - int ocn = interpolation == INTER_LINEAR ? dst.oclchannels() : -1; - int depth = interpolation == INTER_LINEAR ? dst.depth() : -1; + int ocn = dst.oclchannels(), depth = dst.depth(); const char * const interMap[] = { "NN", "LN", "CUBIC", "AREA", "LAN4" }; std::string kernelName = std::string("resize") + interMap[interpolation]; const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; const char * const channelMap[] = { "" , "", "2", "4", "4" }; - std::string buildOption = format("-D %s -D T=%s%s", interMap[interpolation], typeMap[dst.depth()], channelMap[dst.oclchannels()]); + std::string buildOption = format("-D %s -D T=%s%s", interMap[interpolation], typeMap[depth], channelMap[ocn]); int wdepth = std::max(src.depth(), CV_32F); + + // check if fx, fy is integer and then we have inter area fast mode + int iscale_x = saturate_cast(ifx); + int iscale_y = saturate_cast(ify); + + bool is_area_fast = std::abs(ifx - iscale_x) < DBL_EPSILON && + std::abs(ify - iscale_y) < DBL_EPSILON; + if (is_area_fast) + wdepth = std::max(src.depth(), CV_32S); + if (interpolation != INTER_NEAREST) { buildOption += format(" -D WT=%s -D WTV=%s%s -D convertToWTV=convert_%s%s -D convertToT=convert_%s%s%s", - typeMap[wdepth], typeMap[wdepth], channelMap[dst.oclchannels()], - typeMap[wdepth], channelMap[dst.oclchannels()], - typeMap[src.depth()], channelMap[dst.oclchannels()], src.depth() <= CV_32S ? "_sat_rte" : ""); + typeMap[wdepth], typeMap[wdepth], channelMap[ocn], + typeMap[wdepth], channelMap[ocn], + typeMap[src.depth()], channelMap[ocn], src.depth() <= CV_32S ? "_sat_rte" : ""); } size_t blkSizeX = 16, blkSizeY = 16; @@ -352,26 +370,48 @@ namespace cv else glbSizeX = dst.cols; - static oclMat alphaOcl, mapOcl, tabofsOcl; + oclMat alphaOcl, mapOcl, tabofsOcl; if (interpolation == INTER_AREA) { - Size ssize = src.size(), dsize = dst.size(); - int xytab_size = (ssize.width + ssize.height) << 1; - int tabofs_size = dsize.height + dsize.width + 2; - - AutoBuffer _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); - AutoBuffer _xyalpha_tab(xytab_size); - int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); - float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); - int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; - - computeResizeAreaTabs(ssize.width, dsize.width, ifx, xmap_tab, xalpha_tab, xofs_tab); - computeResizeAreaTabs(ssize.height, dsize.height, ify, ymap_tab, yalpha_tab, yofs_tab); - - // loading precomputed arrays to GPU - alphaOcl = oclMat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab); - mapOcl = oclMat(1, xytab_size, CV_32SC1, (void *)_xymap_tab); - tabofsOcl = oclMat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab); + if (is_area_fast) + { + kernelName += "_FAST"; + int wdepth2 = std::max(CV_32F, src.depth()); + buildOption += format(" -D WT2V=%s%s -D convertToWT2V=convert_%s%s -D AREA_FAST -D XSCALE=%d -D YSCALE=%d -D SCALE=%f", + typeMap[wdepth2], channelMap[ocn], typeMap[wdepth2], channelMap[ocn], + iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); + + int smap_tab_size = dst.cols * iscale_x + dst.rows * iscale_y; + AutoBuffer dmap_tab(dst.cols + dst.rows), smap_tab(smap_tab_size); + int * dxmap_tab = dmap_tab, * dymap_tab = dxmap_tab + dst.cols; + int * sxmap_tab = smap_tab, * symap_tab = smap_tab + dst.cols * iscale_y; + + computeResizeAreaFastTabs(dxmap_tab, sxmap_tab, iscale_x, dst.cols, src.cols); + computeResizeAreaFastTabs(dymap_tab, symap_tab, iscale_y, dst.rows, src.rows); + + tabofsOcl = oclMat(1, dst.cols + dst.rows, CV_32SC1, (void *)dmap_tab); + mapOcl = oclMat(1, smap_tab_size, CV_32SC1, (void *)smap_tab); + } + else + { + Size ssize = src.size(), dsize = dst.size(); + int xytab_size = (ssize.width + ssize.height) << 1; + int tabofs_size = dsize.height + dsize.width + 2; + + AutoBuffer _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); + AutoBuffer _xyalpha_tab(xytab_size); + int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); + float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); + int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; + + computeResizeAreaTabs(ssize.width, dsize.width, ifx, xmap_tab, xalpha_tab, xofs_tab); + computeResizeAreaTabs(ssize.height, dsize.height, ify, ymap_tab, yalpha_tab, yofs_tab); + + // loading precomputed arrays to GPU + alphaOcl = oclMat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab); + mapOcl = oclMat(1, xytab_size, CV_32SC1, (void *)_xymap_tab); + tabofsOcl = oclMat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab); + } } size_t globalThreads[3] = { glbSizeX, dst.rows, 1 }; @@ -400,12 +440,18 @@ namespace cv args.push_back( make_pair(sizeof(cl_float), (void *)&ifyf)); } - if (interpolation == INTER_AREA) - { + // precomputed tabs + if (!tabofsOcl.empty()) args.push_back( make_pair(sizeof(cl_mem), (void *)&tabofsOcl.data)); + + if (!mapOcl.empty()) args.push_back( make_pair(sizeof(cl_mem), (void *)&mapOcl.data)); + + if (!alphaOcl.empty()) args.push_back( make_pair(sizeof(cl_mem), (void *)&alphaOcl.data)); - } + + ocn = interpolation == INTER_LINEAR ? ocn : -1; + depth = interpolation == INTER_LINEAR ? depth : -1; openCLExecuteKernel(src.clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, ocn, depth, buildOption.c_str()); diff --git a/modules/ocl/src/opencl/imgproc_resize.cl b/modules/ocl/src/opencl/imgproc_resize.cl index 5a69e87862..ebf8c712b7 100644 --- a/modules/ocl/src/opencl/imgproc_resize.cl +++ b/modules/ocl/src/opencl/imgproc_resize.cl @@ -315,7 +315,44 @@ __kernel void resizeNN(__global T * dst, __global T * src, } } -#elif AREA +#elif defined AREA + +#ifdef AREA_FAST + +__kernel void resizeAREA_FAST(__global T * dst, __global T * src, + int dst_offset, int src_offset, int dst_step, int src_step, + int src_cols, int src_rows, int dst_cols, int dst_rows, WT ifx, WT ify, + __global const int * dmap_tab, __global const int * smap_tab) +{ + int dx = get_global_id(0); + int dy = get_global_id(1); + + if (dx < dst_cols && dy < dst_rows) + { + int dst_index = mad24(dy, dst_step, dst_offset + dx); + + __global const int * xmap_tab = dmap_tab; + __global const int * ymap_tab = dmap_tab + dst_cols; + __global const int * sxmap_tab = smap_tab; + __global const int * symap_tab = smap_tab + XSCALE * dst_cols; + + int sx = xmap_tab[dx], sy = ymap_tab[dy]; + WTV sum = (WTV)(0); + + #pragma unroll + for (int y = 0; y < YSCALE; ++y) + { + int src_index = mad24(symap_tab[y + sy], src_step, src_offset); + #pragma unroll + for (int x = 0; x < XSCALE; ++x) + sum += convertToWTV(src[src_index + sxmap_tab[sx + x]]); + } + + dst[dst_index] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE)); + } +} + +#else __kernel void resizeAREA(__global T * dst, __global T * src, int dst_offset, int src_offset, int dst_step, int src_step, @@ -364,3 +401,5 @@ __kernel void resizeAREA(__global T * dst, __global T * src, } #endif + +#endif diff --git a/modules/ocl/test/utility.cpp b/modules/ocl/test/utility.cpp index f986042929..c9cb0d8d67 100644 --- a/modules/ocl/test/utility.cpp +++ b/modules/ocl/test/utility.cpp @@ -231,7 +231,7 @@ double checkRectSimilarity(Size sz, std::vector& ob1, std::vector& o return final_test_result; } -void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow) +void showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow) { Mat diff, diff_thresh; absdiff(gold, actual, diff); @@ -240,10 +240,18 @@ void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow) if (alwaysShow || cv::countNonZero(diff_thresh.reshape(1)) > 0) { +#if 0 + std::cout << "Src: " << std::endl << src << std::endl; + std::cout << "Reference: " << std::endl << gold << std::endl; + std::cout << "OpenCL: " << std::endl << actual << std::endl; +#endif + + namedWindow("src", WINDOW_NORMAL); namedWindow("gold", WINDOW_NORMAL); namedWindow("actual", WINDOW_NORMAL); namedWindow("diff", WINDOW_NORMAL); + imshow("src", src); imshow("gold", gold); imshow("actual", actual); imshow("diff", diff); diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp index bfbe579361..a1fe3ffb75 100644 --- a/modules/ocl/test/utility.hpp +++ b/modules/ocl/test/utility.hpp @@ -52,7 +52,7 @@ extern int LOOP_TIMES; namespace cvtest { -void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow = false); +void showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow = false); cv::ocl::oclMat createMat_ocl(cv::RNG& rng, Size size, int type, bool useRoi); cv::ocl::oclMat loadMat_ocl(cv::RNG& rng, const Mat& m, bool useRoi);