From 33ae078b0989b44ac8d262d210335b04bb268b4d Mon Sep 17 00:00:00 2001 From: Atlas42 Date: Thu, 31 Dec 2020 11:29:58 +0100 Subject: [PATCH] Merge pull request #2801 from Atlas42:cuda-hough-stream-fix Added stream support on hough circles, lines and segments * Added stream support on hough circles lines and segments - Passed the stream to the different cuda, OpenCV and thurst library calls - Replace all device by cuda synchronizes - Added extra synchronize calls after device to host transfers - Replaced the cuda globals by allocated values * Fixed missing include for CUDA 8 Co-authored-by: william.fink --- .../cudaimgproc/src/cuda/build_point_list.cu | 20 +++----- modules/cudaimgproc/src/cuda/hough_circles.cu | 46 ++++++++----------- modules/cudaimgproc/src/cuda/hough_lines.cu | 33 ++++++------- .../cudaimgproc/src/cuda/hough_segments.cu | 27 +++++------ modules/cudaimgproc/src/hough_circles.cpp | 45 +++++++++++------- modules/cudaimgproc/src/hough_lines.cpp | 39 ++++++++++------ modules/cudaimgproc/src/hough_segments.cpp | 36 ++++++++++----- 7 files changed, 127 insertions(+), 119 deletions(-) diff --git a/modules/cudaimgproc/src/cuda/build_point_list.cu b/modules/cudaimgproc/src/cuda/build_point_list.cu index addcabc24..c30214d5d 100644 --- a/modules/cudaimgproc/src/cuda/build_point_list.cu +++ b/modules/cudaimgproc/src/cuda/build_point_list.cu @@ -49,10 +49,8 @@ namespace cv { namespace cuda { namespace device { namespace hough { - __device__ int g_counter; - template - __global__ void buildPointList(const PtrStepSzb src, unsigned int* list) + __global__ void buildPointList(const PtrStepSzb src, unsigned int* list, int* counterPtr) { __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; __shared__ int s_qsize[4]; @@ -94,7 +92,7 @@ namespace cv { namespace cuda { namespace device } // calculate the offset in the global list - const int globalOffset = atomicAdd(&g_counter, totalSize); + const int globalOffset = atomicAdd(counterPtr, totalSize); for (int i = 0; i < blockDim.y; ++i) s_globStart[i] += globalOffset; } @@ -108,27 +106,23 @@ namespace cv { namespace cuda { namespace device list[gidx] = s_queues[threadIdx.y][i]; } - int buildPointList_gpu(PtrStepSzb src, unsigned int* list) + int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream) { const int PIXELS_PER_THREAD = 16; - void* counterPtr; - cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) ); - - cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) ); const dim3 block(32, 4); const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y)); cudaSafeCall( cudaFuncSetCacheConfig(buildPointList, cudaFuncCachePreferShared) ); - buildPointList<<>>(src, list); + buildPointList<<>>(src, list, counterPtr); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - int totalCount; - cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); return totalCount; } diff --git a/modules/cudaimgproc/src/cuda/hough_circles.cu b/modules/cudaimgproc/src/cuda/hough_circles.cu index db1623ece..fcbf8c412 100644 --- a/modules/cudaimgproc/src/cuda/hough_circles.cu +++ b/modules/cudaimgproc/src/cuda/hough_circles.cu @@ -54,8 +54,6 @@ namespace cv { namespace cuda { namespace device { namespace hough_circles { - __device__ int g_counter; - //////////////////////////////////////////////////////////////////////// // circlesAccumCenters @@ -111,23 +109,22 @@ namespace cv { namespace cuda { namespace device } } - void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp) + void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream) { const dim3 block(256); const dim3 grid(divUp(count, block.x)); cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) ); - circlesAccumCenters<<>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp); + circlesAccumCenters<<>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaStreamSynchronize(stream) ); } //////////////////////////////////////////////////////////////////////// // buildCentersList - - __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold) + __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold, int* counterPtr) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -145,31 +142,27 @@ namespace cv { namespace cuda { namespace device if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right) { const unsigned int val = (y << 16) | x; - const int idx = ::atomicAdd(&g_counter, 1); + const int idx = ::atomicAdd(counterPtr, 1); centers[idx] = val; } } } - int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold) + int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream) { - void* counterPtr; - cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) ); - - cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) ); const dim3 block(32, 8); const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y)); cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) ); - buildCentersList<<>>(accum, centers, threshold); + buildCentersList<<>>(accum, centers, threshold, counterPtr); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - int totalCount; - cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); return totalCount; } @@ -179,7 +172,8 @@ namespace cv { namespace cuda { namespace device __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count, float3* circles, const int maxCircles, const float dp, - const int minRadius, const int maxRadius, const int histSize, const int threshold) + const int minRadius, const int maxRadius, const int histSize, const int threshold, + int* counterPtr) { int* smem = DynamicSharedMem(); @@ -219,7 +213,7 @@ namespace cv { namespace cuda { namespace device if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2]) { - const int ind = ::atomicAdd(&g_counter, 1); + const int ind = ::atomicAdd(counterPtr, 1); if (ind < maxCircles) circles[ind] = make_float3(cx, cy, i + minRadius); } @@ -227,12 +221,9 @@ namespace cv { namespace cuda { namespace device } int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count, - float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20) + float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream) { - void* counterPtr; - cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) ); - - cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) ); const dim3 block(has20 ? 1024 : 512); const dim3 grid(centersCount); @@ -240,13 +231,12 @@ namespace cv { namespace cuda { namespace device const int histSize = maxRadius - minRadius + 1; size_t smemSize = (histSize + 2) * sizeof(int); - circlesAccumRadius<<>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold); + circlesAccumRadius<<>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold, counterPtr); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - int totalCount; - cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); totalCount = ::min(totalCount, maxCircles); diff --git a/modules/cudaimgproc/src/cuda/hough_lines.cu b/modules/cudaimgproc/src/cuda/hough_lines.cu index 9a93cbf14..6fa557911 100644 --- a/modules/cudaimgproc/src/cuda/hough_lines.cu +++ b/modules/cudaimgproc/src/cuda/hough_lines.cu @@ -44,6 +44,7 @@ #include #include +#include #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/emulation.hpp" @@ -53,8 +54,6 @@ namespace cv { namespace cuda { namespace device { namespace hough_lines { - __device__ int g_counter; - //////////////////////////////////////////////////////////////////////// // linesAccum @@ -126,7 +125,7 @@ namespace cv { namespace cuda { namespace device accumRow[i] = smem[i]; } - void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20) + void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream) { const dim3 block(has20 ? 1024 : 512); const dim3 grid(accum.rows - 2); @@ -134,19 +133,18 @@ namespace cv { namespace cuda { namespace device size_t smemSize = (accum.cols - 1) * sizeof(int); if (smemSize < sharedMemPerBlock - 1000) - linesAccumShared<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); + linesAccumShared<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); else - linesAccumGlobal<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); + linesAccumGlobal<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaStreamSynchronize(stream) ); } //////////////////////////////////////////////////////////////////////// // linesGetResult - __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho) + __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho, int* counterPtr) { const int r = blockIdx.x * blockDim.x + threadIdx.x; const int n = blockIdx.y * blockDim.y + threadIdx.y; @@ -165,7 +163,7 @@ namespace cv { namespace cuda { namespace device const float radius = (r - (numrho - 1) * 0.5f) * rho; const float angle = n * theta; - const int ind = ::atomicAdd(&g_counter, 1); + const int ind = ::atomicAdd(counterPtr, 1); if (ind < maxSize) { out[ind] = make_float2(radius, angle); @@ -174,25 +172,22 @@ namespace cv { namespace cuda { namespace device } } - int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort) + int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream) { - void* counterPtr; - cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) ); - - cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) ); const dim3 block(32, 8); const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y)); cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) ); - linesGetResult<<>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2); + linesGetResult<<>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2, counterPtr); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - int totalCount; - cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + + cudaSafeCall( cudaStreamSynchronize(stream) ); totalCount = ::min(totalCount, maxSize); @@ -200,7 +195,7 @@ namespace cv { namespace cuda { namespace device { thrust::device_ptr outPtr(out); thrust::device_ptr votesPtr(votes); - thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater()); + thrust::sort_by_key(thrust::cuda::par.on(stream), votesPtr, votesPtr + totalCount, outPtr, thrust::greater()); } return totalCount; diff --git a/modules/cudaimgproc/src/cuda/hough_segments.cu b/modules/cudaimgproc/src/cuda/hough_segments.cu index ca433d30d..988f14c1d 100644 --- a/modules/cudaimgproc/src/cuda/hough_segments.cu +++ b/modules/cudaimgproc/src/cuda/hough_segments.cu @@ -49,15 +49,14 @@ namespace cv { namespace cuda { namespace device { namespace hough_segments { - __device__ int g_counter; - texture tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp); __global__ void houghLinesProbabilistic(const PtrStepSzi accum, int4* out, const int maxSize, const float rho, const float theta, const int lineGap, const int lineLength, - const int rows, const int cols) + const int rows, const int cols, + int* counterPtr) { const int r = blockIdx.x * blockDim.x + threadIdx.x; const int n = blockIdx.y * blockDim.y + threadIdx.y; @@ -182,7 +181,7 @@ namespace cv { namespace cuda { namespace device if (good_line) { - const int ind = ::atomicAdd(&g_counter, 1); + const int ind = ::atomicAdd(counterPtr, 1); if (ind < maxSize) out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y); } @@ -202,7 +201,7 @@ namespace cv { namespace cuda { namespace device if (good_line) { - const int ind = ::atomicAdd(&g_counter, 1); + const int ind = ::atomicAdd(counterPtr, 1); if (ind < maxSize) out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y); } @@ -214,29 +213,27 @@ namespace cv { namespace cuda { namespace device } } - int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength) + int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream) { - void* counterPtr; - cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) ); - - cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) ); const dim3 block(32, 8); const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y)); bindTexture(&tex_mask, mask); - houghLinesProbabilistic<<>>(accum, + houghLinesProbabilistic<<>>(accum, out, maxSize, rho, theta, lineGap, lineLength, - mask.rows, mask.cols); + mask.rows, mask.cols, + counterPtr); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - int totalCount; - cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + + cudaSafeCall( cudaStreamSynchronize(stream) ); totalCount = ::min(totalCount, maxSize); diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp index 0fa962d71..61d49d46b 100644 --- a/modules/cudaimgproc/src/hough_circles.cpp +++ b/modules/cudaimgproc/src/hough_circles.cpp @@ -55,15 +55,15 @@ namespace cv { namespace cuda { namespace device { namespace hough { - int buildPointList_gpu(PtrStepSzb src, unsigned int* list); + int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream); } namespace hough_circles { - void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp); - int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold); + void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream); + int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream); int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count, - float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20); + float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream); } }}} @@ -73,6 +73,7 @@ namespace { public: HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles); + ~HoughCirclesDetectorImpl(); void detect(InputArray src, OutputArray circles, Stream& stream); @@ -140,6 +141,8 @@ namespace Ptr filterDx_; Ptr filterDy_; Ptr canny_; + + int* counterPtr_; }; bool centersCompare(Vec3f a, Vec3f b) {return (a[2] > b[2]);} @@ -153,16 +156,22 @@ namespace filterDx_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 1, 0); filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1); + + cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int))); } - void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream) + HoughCirclesDetectorImpl::~HoughCirclesDetectorImpl() { - // TODO : implement async version - CV_UNUSED(stream); + cudaSafeCall(cudaFree(counterPtr_)); + } + void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream) + { using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_circles; + auto cudaStream = StreamAccessor::getStream(stream); + GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_8UC1 ); @@ -182,13 +191,13 @@ namespace canny_->setLowThreshold(std::max(cannyThreshold_ / 2, 1)); canny_->setHighThreshold(cannyThreshold_); - canny_->detect(dx_, dy_, edges_); + canny_->detect(dx_, dy_, edges_, stream); ensureSizeIsEnough(2, src.size().area(), CV_32SC1, list_); unsigned int* srcPoints = list_.ptr(0); unsigned int* centers = list_.ptr(1); - const int pointsCount = buildPointList_gpu(edges_, srcPoints); + const int pointsCount = buildPointList_gpu(edges_, srcPoints, counterPtr_, cudaStream); if (pointsCount == 0) { circles.release(); @@ -196,13 +205,13 @@ namespace } ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, accum_); - accum_.setTo(Scalar::all(0)); + accum_.setTo(Scalar::all(0), stream); - circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp); + circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp, cudaStream); - accum_.download(tt); + accum_.download(tt, stream); - int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_); + int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_, counterPtr_, cudaStream); if (centersCount == 0) { circles.release(); @@ -218,7 +227,8 @@ namespace ushort2* oldBuf = oldBuf_.data(); ushort2* newBuf = newBuf_.data(); - cudaSafeCall( cudaMemcpy(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost, cudaStream) ); + cudaSafeCall( cudaStreamSynchronize(cudaStream) ); const int cellSize = cvRound(minDist_); const int gridWidth = (src.cols + cellSize - 1) / cellSize; @@ -290,14 +300,15 @@ namespace } } - cudaSafeCall( cudaMemcpy(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice) ); + cudaSafeCall( cudaMemcpyAsync(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice, cudaStream) ); centersCount = newCount; } ensureSizeIsEnough(1, maxCircles_, CV_32FC3, result_); int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, result_.ptr(), maxCircles_, - dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20)); + dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20), + counterPtr_, cudaStream); if (circlesCount == 0) { @@ -306,7 +317,7 @@ namespace } result_.cols = circlesCount; - result_.copyTo(circles); + result_.copyTo(circles, stream); } } diff --git a/modules/cudaimgproc/src/hough_lines.cpp b/modules/cudaimgproc/src/hough_lines.cpp index e112e09a3..06de8c0be 100644 --- a/modules/cudaimgproc/src/hough_lines.cpp +++ b/modules/cudaimgproc/src/hough_lines.cpp @@ -55,13 +55,13 @@ namespace cv { namespace cuda { namespace device { namespace hough { - int buildPointList_gpu(PtrStepSzb src, unsigned int* list); + int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream); } namespace hough_lines { - void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20); - int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort); + void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream); + int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream); } }}} @@ -70,10 +70,8 @@ namespace class HoughLinesDetectorImpl : public HoughLinesDetector { public: - HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) : - rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines) - { - } + HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines); + ~HoughLinesDetectorImpl(); void detect(InputArray src, OutputArray lines, Stream& stream); void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream); @@ -124,16 +122,27 @@ namespace GpuMat accum_; GpuMat list_; GpuMat result_; + + int* counterPtr_; }; - void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) + HoughLinesDetectorImpl::HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) : + rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines) + { + cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int))); + } + + HoughLinesDetectorImpl::~HoughLinesDetectorImpl() { - // TODO : implement async version - CV_UNUSED(stream); + cudaSafeCall(cudaFree(counterPtr_)); + } + void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) + { using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_lines; + auto cudaStream = StreamAccessor::getStream(stream); GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_8UC1 ); @@ -143,7 +152,7 @@ namespace ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_); unsigned int* srcPoints = list_.ptr(); - const int pointsCount = buildPointList_gpu(src, srcPoints); + const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream); if (pointsCount == 0) { lines.release(); @@ -155,14 +164,14 @@ namespace CV_Assert( numangle > 0 && numrho > 0 ); ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_); - accum_.setTo(Scalar::all(0)); + accum_.setTo(Scalar::all(0), stream); DeviceInfo devInfo; - linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20)); + linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream); ensureSizeIsEnough(2, maxLines_, CV_32FC2, result_); - int linesCount = linesGetResult_gpu(accum_, result_.ptr(0), result_.ptr(1), maxLines_, rho_, theta_, threshold_, doSort_); + int linesCount = linesGetResult_gpu(accum_, result_.ptr(0), result_.ptr(1), maxLines_, rho_, theta_, threshold_, doSort_, counterPtr_, cudaStream); if (linesCount == 0) { @@ -171,7 +180,7 @@ namespace } result_.cols = linesCount; - result_.copyTo(lines); + result_.copyTo(lines, stream); } void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream) diff --git a/modules/cudaimgproc/src/hough_segments.cpp b/modules/cudaimgproc/src/hough_segments.cpp index 34ee47446..f0eb0beec 100644 --- a/modules/cudaimgproc/src/hough_segments.cpp +++ b/modules/cudaimgproc/src/hough_segments.cpp @@ -55,17 +55,17 @@ namespace cv { namespace cuda { namespace device { namespace hough { - int buildPointList_gpu(PtrStepSzb src, unsigned int* list); + int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream); } namespace hough_lines { - void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20); + void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream); } namespace hough_segments { - int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength); + int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream); } }}} @@ -74,10 +74,8 @@ namespace class HoughSegmentDetectorImpl : public HoughSegmentDetector { public: - HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) : - rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines) - { - } + HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines); + ~HoughSegmentDetectorImpl(); void detect(InputArray src, OutputArray lines, Stream& stream); @@ -127,8 +125,21 @@ namespace GpuMat accum_; GpuMat list_; GpuMat result_; + + int* counterPtr_; }; + HoughSegmentDetectorImpl::HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) : + rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines) + { + cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int))); + } + + HoughSegmentDetectorImpl::~HoughSegmentDetectorImpl() + { + cudaSafeCall(cudaFree(counterPtr_)); + } + void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) { // TODO : implement async version @@ -138,6 +149,7 @@ namespace using namespace cv::cuda::device::hough_lines; using namespace cv::cuda::device::hough_segments; + auto cudaStream = StreamAccessor::getStream(stream); GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_8UC1 ); @@ -147,7 +159,7 @@ namespace ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_); unsigned int* srcPoints = list_.ptr(); - const int pointsCount = buildPointList_gpu(src, srcPoints); + const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream); if (pointsCount == 0) { lines.release(); @@ -159,14 +171,14 @@ namespace CV_Assert( numangle > 0 && numrho > 0 ); ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_); - accum_.setTo(Scalar::all(0)); + accum_.setTo(Scalar::all(0), stream); DeviceInfo devInfo; - linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20)); + linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream); ensureSizeIsEnough(1, maxLines_, CV_32SC4, result_); - int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_); + int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_, counterPtr_, cudaStream); if (linesCount == 0) { @@ -175,7 +187,7 @@ namespace } result_.cols = linesCount; - result_.copyTo(lines); + result_.copyTo(lines, stream); } }