From c3fa7974e6113f87876759f32c223fb88afb0ccb Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 14 Aug 2012 17:00:57 +0400 Subject: [PATCH] new optimized version --- modules/gpu/include/opencv2/gpu/gpu.hpp | 4 +- modules/gpu/perf/perf_imgproc.cpp | 5 +- modules/gpu/src/cuda/hough.cu | 163 ++++++++++++++++++++---- modules/gpu/src/hough.cpp | 23 ++-- modules/gpu/test/test_imgproc.cpp | 3 +- 5 files changed, 159 insertions(+), 39 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index cb2e688726..170c4d5ff6 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -821,8 +821,8 @@ private: }; CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); -CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); -CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, float theta); +CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); +CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta); CV_EXPORTS void HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_voices = noArray()); diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 3baca6c9d3..0dbcd34c65 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -1364,11 +1364,12 @@ GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_lines; cv::gpu::GpuMat d_accum; - cv::gpu::HoughLines(d_src, d_lines, d_accum, rho, theta, threshold, doSort); + cv::gpu::GpuMat d_buf; + cv::gpu::HoughLines(d_src, d_lines, d_accum, d_buf, rho, theta, threshold, doSort); TEST_CYCLE() { - cv::gpu::HoughLines(d_src, d_lines, d_accum, rho, theta, threshold, doSort); + cv::gpu::HoughLines(d_src, d_lines, d_accum, d_buf, rho, theta, threshold, doSort); } } diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 9f5cd65348..8c9c075f6a 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -42,55 +42,167 @@ #include #include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/emulation.hpp" namespace cv { namespace gpu { namespace device { namespace hough { - __global__ void linesAccum(const DevMem2Db src, PtrStep_ accum, const float theta, const int numangle, const int numrho, const float irho) + __device__ unsigned int g_counter; + + const int PIXELS_PER_THREAD = 16; + + __global__ void buildPointList(const DevMem2Db src, unsigned int* list) { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * 32 * PIXELS_PER_THREAD + threadIdx.x; + const int y = blockIdx.y * 4 + threadIdx.y; - if (x >= src.cols || y >= src.rows) + if (y >= src.rows) return; - if (src(y, x)) + volatile int qindex = -1; + __shared__ volatile int s_qindex[4]; + __shared__ volatile int s_qstart[4]; + s_qindex[threadIdx.y] = -1; + + __shared__ volatile unsigned int s_queue[4][32 * PIXELS_PER_THREAD]; + + // fill the queue + for (int i = 0; i < PIXELS_PER_THREAD; ++i) { - float ang = 0.0f; - for(int n = 0; n < numangle; ++n, ang += theta) + const int xx = i * blockDim.x + x; + + if (xx >= src.cols) + break; + + if (src(y, xx)) { - float sin_ang; - float cos_ang; - sincosf(ang, &sin_ang, &cos_ang); + const unsigned int queue_val = (y << 16) | xx; - const float tabSin = sin_ang * irho; - const float tabCos = cos_ang * irho; + do { + qindex++; + s_qindex[threadIdx.y] = qindex; + s_queue[threadIdx.y][qindex] = queue_val; + } while (s_queue[threadIdx.y][qindex] != queue_val); + } + + // reload index from smem (last thread to write to smem will have updated it) + qindex = s_qindex[threadIdx.y]; + } - int r = __float2int_rn(x * tabCos + y * tabSin); - r += (numrho - 1) / 2; + __syncthreads(); - atomicInc(accum.ptr(n + 1) + r + 1, (unsigned int)-1); + // let one thread reserve the space required in the global list + if (threadIdx.x == 0 && threadIdx.y == 0) + { + // find how many items are stored in each list + int total_index = 0; + #pragma unroll + for (int i = 0; i < 4; ++i) + { + s_qstart[i] = total_index; + total_index += (s_qindex[i] + 1u); } + + //calculate the offset in the global list + const unsigned int global_offset = atomicAdd(&g_counter, total_index); + #pragma unroll + for (int i = 0; i < 4; ++i) + s_qstart[i] += global_offset; + } + + __syncthreads(); + + // copy local queues to global queue + for(int i = 0; i <= qindex; i += 32) + { + if(i + threadIdx.x > qindex) + break; + + unsigned int qvalue = s_queue[threadIdx.y][i + threadIdx.x]; + list[s_qstart[threadIdx.y] + i + threadIdx.x] = qvalue; } } - void linesAccum_gpu(DevMem2Db src, DevMem2D_ accum, float rho, float theta) + unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list) { - const dim3 block(32, 8); - const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); + + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + + const dim3 block(32, 4); + const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y)); + + cudaSafeCall( cudaFuncSetCacheConfig(buildPointList, cudaFuncCachePreferShared) ); - linesAccum<<>>(src, accum, theta, accum.rows - 2, accum.cols - 2, 1.0f / rho); + buildPointList<<>>(src, list); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); + + unsigned int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + + return total_count; } - __device__ unsigned int g_counter; + __global__ void linesAccum(const unsigned int* list, const unsigned int count, PtrStep_ accum, + const float irho, const float theta, const int numrho) + { + extern __shared__ unsigned int smem[]; + + for (int i = threadIdx.x; i < numrho; i += blockDim.x) + smem[i] = 0; + __syncthreads(); + + const int n = blockIdx.x; + const float ang = n * theta; + + float sin_ang; + float cos_ang; + sincosf(ang, &sin_ang, &cos_ang); + + const float tabSin = sin_ang * irho; + const float tabCos = cos_ang * irho; + + for (int i = threadIdx.x; i < count; i += blockDim.x) + { + // read one element from global memory + const unsigned int qvalue = list[i]; + const unsigned int x = (qvalue & 0x0000FFFF); + const unsigned int y = (qvalue >> 16) & 0x0000FFFF; + + int r = __float2int_rn(x * tabCos + y * tabSin); + r += (numrho - 1) / 2; + + Emulation::smem::atomicInc(&smem[r], (unsigned int)(-1)); + } + __syncthreads(); + + for (int i = threadIdx.x; i < numrho; i += blockDim.x) + accum(n + 1, i + 1) = smem[i]; + } + + void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta) + { + const dim3 block(1024); + const dim3 grid(accum.rows - 2); + + cudaSafeCall( cudaFuncSetCacheConfig(linesAccum, cudaFuncCachePreferShared) ); + + size_t smem_size = (accum.cols - 2) * sizeof(unsigned int); + + linesAccum<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } - __global__ void linesGetResult(const DevMem2D_ accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho) + __global__ void linesGetResult(const DevMem2D_ accum, float2* out, int* voices, const int maxSize, + const float threshold, const float theta, const float rho, const int numrho) { - __shared__ uint smem[8][32]; + __shared__ unsigned int smem[8][32]; int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x; int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y; @@ -125,7 +237,8 @@ namespace cv { namespace gpu { namespace device } } - unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort) + unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, + float rho, float theta, float threshold, bool doSort) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); @@ -140,8 +253,8 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); - uint total_count; - cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) ); + unsigned int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); total_count = ::min(total_count, maxSize); diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index b577ca5071..f4d4399d26 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -46,16 +46,23 @@ namespace cv { namespace gpu { namespace device { namespace hough { - void linesAccum_gpu(DevMem2Db src, DevMem2D_ accum, float rho, float theta); + unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list); + void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta); unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort); } }}} -void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, float theta) +void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta) { - using namespace cv::gpu::device; + using namespace cv::gpu::device::hough; CV_Assert(src.type() == CV_8UC1); + CV_Assert(src.cols < std::numeric_limits::max()); + CV_Assert(src.rows < std::numeric_limits::max()); + + ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); + + unsigned int count = buildPointList_gpu(src, buf.ptr()); const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); @@ -63,7 +70,7 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, f ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); accum.setTo(cv::Scalar::all(0)); - hough::linesAccum_gpu(src, accum, rho, theta); + linesAccum_gpu(buf.ptr(), count, accum, rho, theta); } void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) @@ -83,13 +90,13 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) { - cv::gpu::GpuMat accum; - HoughLines(src, lines, accum, rho, theta, threshold, doSort, maxLines); + cv::gpu::GpuMat accum, buf; + HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines); } -void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, float rho, float theta, int threshold, bool doSort, int maxLines) +void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines) { - HoughLinesTransform(src, accum, rho, theta); + HoughLinesTransform(src, accum, buf, rho, theta); HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines); } diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index b0e587e4ad..4d67de59d3 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -1155,7 +1155,7 @@ TEST_P(HoughLines, Accuracy) const float rho = 1.0f; const float theta = CV_PI / 180.0f; - const int threshold = 300; + const int threshold = 50; cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty()); @@ -1182,7 +1182,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine( ALL_DEVICES, testing::Values(std::string("../cv/shared/pic1.png"), std::string("../cv/shared/pic3.png"), - std::string("../cv/shared/pic4.png"), std::string("../cv/shared/pic5.png"), std::string("../cv/shared/pic6.png"))));