From 39b27a19bec6dc567bb246092a9ed522e2cf67c2 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Fri, 5 Sep 2014 12:20:29 +0400 Subject: [PATCH] Refactoring and optimization --- modules/imgproc/src/hough.cpp | 114 ++++++++++++++-------- modules/imgproc/src/opencl/hough_lines.cl | 5 +- 2 files changed, 76 insertions(+), 43 deletions(-) diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp index 29ca551fb1..386c7e948c 100644 --- a/modules/imgproc/src/hough.cpp +++ b/modules/imgproc/src/hough.cpp @@ -652,80 +652,102 @@ HoughLinesProbabilistic( Mat& image, } } } - - -static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, double theta, int threshold, - double min_theta, double max_theta) +static bool ocl_makePointsList(InputArray _src, OutputArray _pointsList, InputOutputArray _counters) { - CV_Assert(_src.type() == CV_8UC1); - - if (max_theta < 0 || max_theta > CV_PI ) { - CV_Error( CV_StsBadArg, "max_theta must fall between 0 and pi" ); - } - if (min_theta < 0 || min_theta > max_theta ) { - CV_Error( CV_StsBadArg, "min_theta must fall between 0 and max_theta" ); - } - UMat src = _src.getUMat(); - - float irho = (float) (1 / rho); - int numangle = cvRound((max_theta - min_theta) / theta); - int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); + _pointsList.create(1, (int) src.total(), CV_32SC1); + UMat pointsList = _pointsList.getUMat(); + UMat counters = _counters.getUMat(); ocl::Device dev = ocl::Device::getDefault(); - // make list of nonzero points - const int pixelsPerWI = 8; + const int pixelsPerWI = 16; int workgroup_size = min((int) dev.maxWorkGroupSize(), (src.cols + pixelsPerWI - 1)/pixelsPerWI); ocl::Kernel pointListKernel("make_point_list", ocl::imgproc::hough_lines_oclsrc, format("-D MAKE_POINTS_LIST -D GROUP_SIZE=%d -D LOCAL_SIZE=%d", workgroup_size, src.cols)); if (pointListKernel.empty()) return false; - UMat pointsList(1, (int) src.total(), CV_32SC1); - UMat counters(1, 2, CV_32SC1, Scalar::all(0)); pointListKernel.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(pointsList), ocl::KernelArg::PtrWriteOnly(counters)); + size_t localThreads[2] = { workgroup_size, 1 }; size_t globalThreads[2] = { workgroup_size, src.rows }; - if (!pointListKernel.run(2, globalThreads, localThreads, false)) - return false; + return pointListKernel.run(2, globalThreads, localThreads, false); +} - int total_points = counters.getMat(ACCESS_READ).at(0, 0); - if (total_points <= 0) - { - _lines.assign(UMat(0,0,CV_32FC2)); - return true; - } +static bool ocl_fillAccum(InputArray _pointsList, OutputArray _accum, int total_points, double rho, double theta, int numrho, int numangle) +{ + UMat pointsList = _pointsList.getUMat(); + _accum.create(numangle + 2, numrho + 2, CV_32SC1); + UMat accum = _accum.getUMat(); + ocl::Device dev = ocl::Device::getDefault(); - // convert src image to hough space - UMat accum(numangle + 2, numrho + 2, CV_32SC1); - workgroup_size = min((int) dev.maxWorkGroupSize(), total_points); + float irho = (float) (1 / rho); + int workgroup_size = min((int) dev.maxWorkGroupSize(), total_points); + ocl::Kernel fillAccumKernel; - size_t* fillAccumLT = NULL; + size_t localThreads[2]; + size_t globalThreads[2]; + int local_memory_needed = (numrho + 2)*sizeof(int); if (local_memory_needed > dev.localMemSize()) { accum.setTo(Scalar::all(0)); fillAccumKernel.create("fill_accum_global", ocl::imgproc::hough_lines_oclsrc, format("-D FILL_ACCUM_GLOBAL")); + if (fillAccumKernel.empty()) + return false; globalThreads[0] = workgroup_size; globalThreads[1] = numangle; + fillAccumKernel.args(ocl::KernelArg::ReadOnlyNoSize(pointsList), ocl::KernelArg::WriteOnly(accum), + total_points, irho, (float) theta, numrho, numangle); + return fillAccumKernel.run(2, globalThreads, NULL, false); } else { fillAccumKernel.create("fill_accum_local", ocl::imgproc::hough_lines_oclsrc, format("-D FILL_ACCUM_LOCAL -D LOCAL_SIZE=%d -D BUFFER_SIZE=%d", workgroup_size, numrho + 2)); + if (fillAccumKernel.empty()) + return false; localThreads[0] = workgroup_size; localThreads[1] = 1; globalThreads[0] = workgroup_size; globalThreads[1] = numangle+2; - fillAccumLT = localThreads; + fillAccumKernel.args(ocl::KernelArg::ReadOnlyNoSize(pointsList), ocl::KernelArg::WriteOnly(accum), + total_points, irho, (float) theta, numrho, numangle); + return fillAccumKernel.run(2, globalThreads, localThreads, false); + } +} + +static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, double theta, int threshold, + double min_theta, double max_theta) +{ + CV_Assert(_src.type() == CV_8UC1); + + if (max_theta < 0 || max_theta > CV_PI ) { + CV_Error( CV_StsBadArg, "max_theta must fall between 0 and pi" ); } - if (fillAccumKernel.empty()) + if (min_theta < 0 || min_theta > max_theta ) { + CV_Error( CV_StsBadArg, "min_theta must fall between 0 and max_theta" ); + } + + UMat src = _src.getUMat(); + int numangle = cvRound((max_theta - min_theta) / theta); + int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); + + UMat pointsList; + UMat counters(1, 2, CV_32SC1, Scalar::all(0)); + + if (!ocl_makePointsList(src, pointsList, counters)) return false; - fillAccumKernel.args(ocl::KernelArg::ReadOnlyNoSize(pointsList), ocl::KernelArg::WriteOnly(accum), - total_points, irho, (float) theta, numrho, numangle); + int total_points = counters.getMat(ACCESS_READ).at(0, 0); + if (total_points <= 0) + { + _lines.assign(UMat(0,0,CV_32FC2)); + return true; + } - if (!fillAccumKernel.run(2, globalThreads, fillAccumLT, false)) + UMat accum; + if (!ocl_fillAccum(pointsList, accum, total_points, rho, theta, numrho, numangle)) return false; const int pixPerWI = 8; @@ -741,7 +763,7 @@ static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, doub getLinesKernel.args(ocl::KernelArg::ReadOnly(accum), ocl::KernelArg::WriteOnlyNoSize(lines), ocl::KernelArg::PtrWriteOnly(counters), linesMax, threshold, (float) rho, (float) theta); - globalThreads[0] = (numrho + pixPerWI - 1)/pixPerWI; globalThreads[1] = numangle; + size_t globalThreads[2] = { (numrho + pixPerWI - 1)/pixPerWI, numangle }; if (!getLinesKernel.run(2, globalThreads, NULL, false)) return false; @@ -753,13 +775,23 @@ static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, doub return true; } +static bool ocl_HoughLinesP(InputArray _src, OutputArray _lines, double rho, double theta, int threshold, + double minLineLength, double maxGap) +{ + CV_Assert(_src.type() == CV_8UC1); + + UMat src = _src.getUMat(); + + return false; +} + } void cv::HoughLines( InputArray _image, OutputArray _lines, double rho, double theta, int threshold, double srn, double stn, double min_theta, double max_theta ) { - CV_OCL_RUN(srn == 0 && stn == 0 && _lines.isUMat(), + CV_OCL_RUN(srn == 0 && stn == 0 && _image.isUMat() && _lines.isUMat(), ocl_HoughLines(_image, _lines, rho, theta, threshold, min_theta, max_theta)); Mat image = _image.getMat(); @@ -778,6 +810,8 @@ void cv::HoughLinesP(InputArray _image, OutputArray _lines, double rho, double theta, int threshold, double minLineLength, double maxGap ) { + CV_OCL_RUN(_image.isUMat() && _lines.isUMat(), ocl_HoughLinesP(_image, _lines, rho, theta, threshold, minLineLength, maxGap)); + Mat image = _image.getMat(); std::vector lines; HoughLinesProbabilistic(image, (float)rho, (float)theta, threshold, cvRound(minLineLength), cvRound(maxGap), lines, INT_MAX); diff --git a/modules/imgproc/src/opencl/hough_lines.cl b/modules/imgproc/src/opencl/hough_lines.cl index 25ebe4708f..d402d37419 100644 --- a/modules/imgproc/src/opencl/hough_lines.cl +++ b/modules/imgproc/src/opencl/hough_lines.cl @@ -107,14 +107,13 @@ __kernel void fill_accum_local(__global const uchar * list_ptr, int list_step, i __global const int * list = (__global const int*)(list_ptr + list_offset); const int shift = (numrho - 1) / 2; - for (int i = count_idx; i < total_points; i += LOCAL_SIZE) { const int point = list[i]; const int x = (point & 0xFFFF); - const int y = (point >> 16) & 0xFFFF; + const int y = point >> 16; - int r = convert_int_rte(x * cosVal + y * sinVal) + shift; + int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift; atomic_inc(l_accum + r + 1); }