From 33ae078b0989b44ac8d262d210335b04bb268b4d Mon Sep 17 00:00:00 2001
From: Atlas42 <william.fink@gmail.com>
Date: Thu, 31 Dec 2020 11:29:58 +0100
Subject: [PATCH] Merge pull request #2801 from Atlas42:cuda-hough-stream-fix

Added stream support on hough circles, lines and segments

* Added stream support on hough circles lines and segments
- Passed the stream to the different cuda, OpenCV and thurst library calls
- Replace all device by cuda synchronizes
- Added extra synchronize calls after device to host transfers
- Replaced the cuda globals by allocated values

* Fixed missing include for CUDA 8

Co-authored-by: william.fink <will@recurse.io>
---
 .../cudaimgproc/src/cuda/build_point_list.cu  | 20 +++-----
 modules/cudaimgproc/src/cuda/hough_circles.cu | 46 ++++++++-----------
 modules/cudaimgproc/src/cuda/hough_lines.cu   | 33 ++++++-------
 .../cudaimgproc/src/cuda/hough_segments.cu    | 27 +++++------
 modules/cudaimgproc/src/hough_circles.cpp     | 45 +++++++++++-------
 modules/cudaimgproc/src/hough_lines.cpp       | 39 ++++++++++------
 modules/cudaimgproc/src/hough_segments.cpp    | 36 ++++++++++-----
 7 files changed, 127 insertions(+), 119 deletions(-)
diff --git a/modules/cudaimgproc/src/cuda/build_point_list.cu b/modules/cudaimgproc/src/cuda/build_point_list.cu
index addcabc24..c30214d5d 100644
--- a/modules/cudaimgproc/src/cuda/build_point_list.cu
+++ b/modules/cudaimgproc/src/cuda/build_point_list.cu
@@ -49,10 +49,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough
     {
-        __device__ int g_counter;
-
         template <int PIXELS_PER_THREAD>
-        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
+        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list, int* counterPtr)
         {
             __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
             __shared__ int s_qsize[4];
@@ -94,7 +92,7 @@ namespace cv { namespace cuda { namespace device
                 }
 
                 // calculate the offset in the global list
-                const int globalOffset = atomicAdd(&g_counter, totalSize);
+                const int globalOffset = atomicAdd(counterPtr, totalSize);
                 for (int i = 0; i < blockDim.y; ++i)
                     s_globStart[i] += globalOffset;
             }
@@ -108,27 +106,23 @@ namespace cv { namespace cuda { namespace device
                 list[gidx] = s_queues[threadIdx.y][i];
         }
 
-        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream)
         {
             const int PIXELS_PER_THREAD = 16;
 
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 4);
             const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
 
             cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
 
-            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
+            buildPointList<PIXELS_PER_THREAD><<<grid, block, 0, stream>>>(src, list, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             return totalCount;
         }
diff --git a/modules/cudaimgproc/src/cuda/hough_circles.cu b/modules/cudaimgproc/src/cuda/hough_circles.cu
index db1623ece..fcbf8c412 100644
--- a/modules/cudaimgproc/src/cuda/hough_circles.cu
+++ b/modules/cudaimgproc/src/cuda/hough_circles.cu
@@ -54,8 +54,6 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_circles
     {
-        __device__ int g_counter;
-
         ////////////////////////////////////////////////////////////////////////
         // circlesAccumCenters
 
@@ -111,23 +109,22 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
+        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream)
         {
             const dim3 block(256);
             const dim3 grid(divUp(count, block.x));
 
             cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
 
-            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
+            circlesAccumCenters<<<grid, block, 0, stream>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
         }
 
         ////////////////////////////////////////////////////////////////////////
         // buildCentersList
-
-        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
+        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold, int* counterPtr)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -145,31 +142,27 @@ namespace cv { namespace cuda { namespace device
                 if (cur > threshold && cur > top && cur >= bottom && cur >  left && cur >= right)
                 {
                     const unsigned int val = (y << 16) | x;
-                    const int idx = ::atomicAdd(&g_counter, 1);
+                    const int idx = ::atomicAdd(counterPtr, 1);
                     centers[idx] = val;
                 }
             }
         }
 
-        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
+        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
 
             cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
 
-            buildCentersList<<<grid, block>>>(accum, centers, threshold);
+            buildCentersList<<<grid, block, 0, stream>>>(accum, centers, threshold, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             return totalCount;
         }
@@ -179,7 +172,8 @@ namespace cv { namespace cuda { namespace device
 
         __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                            float3* circles, const int maxCircles, const float dp,
-                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
+                                           const int minRadius, const int maxRadius, const int histSize, const int threshold,
+                                           int* counterPtr)
         {
             int* smem = DynamicSharedMem<int>();
 
@@ -219,7 +213,7 @@ namespace cv { namespace cuda { namespace device
 
                 if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
                 {
-                    const int ind = ::atomicAdd(&g_counter, 1);
+                    const int ind = ::atomicAdd(counterPtr, 1);
                     if (ind < maxCircles)
                         circles[ind] = make_float3(cx, cy, i + minRadius);
                 }
@@ -227,12 +221,9 @@ namespace cv { namespace cuda { namespace device
         }
 
         int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
-                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
+                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(has20 ? 1024 : 512);
             const dim3 grid(centersCount);
@@ -240,13 +231,12 @@ namespace cv { namespace cuda { namespace device
             const int histSize = maxRadius - minRadius + 1;
             size_t smemSize = (histSize + 2) * sizeof(int);
 
-            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
+            circlesAccumRadius<<<grid, block, smemSize, stream>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             totalCount = ::min(totalCount, maxCircles);
 
diff --git a/modules/cudaimgproc/src/cuda/hough_lines.cu b/modules/cudaimgproc/src/cuda/hough_lines.cu
index 9a93cbf14..6fa557911 100644
--- a/modules/cudaimgproc/src/cuda/hough_lines.cu
+++ b/modules/cudaimgproc/src/cuda/hough_lines.cu
@@ -44,6 +44,7 @@
 
 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>
+#include <thrust/system/cuda/execution_policy.h>
 
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/emulation.hpp"
@@ -53,8 +54,6 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_lines
     {
-        __device__ int g_counter;
-
         ////////////////////////////////////////////////////////////////////////
         // linesAccum
 
@@ -126,7 +125,7 @@ namespace cv { namespace cuda { namespace device
                 accumRow[i] = smem[i];
         }
 
-        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream)
         {
             const dim3 block(has20 ? 1024 : 512);
             const dim3 grid(accum.rows - 2);
@@ -134,19 +133,18 @@ namespace cv { namespace cuda { namespace device
             size_t smemSize = (accum.cols - 1) * sizeof(int);
 
             if (smemSize < sharedMemPerBlock - 1000)
-                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
+                linesAccumShared<<<grid, block, smemSize, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
             else
-                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
+                linesAccumGlobal<<<grid, block, 0, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
 
             cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
         }
 
         ////////////////////////////////////////////////////////////////////////
         // linesGetResult
 
-        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
+        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho, int* counterPtr)
         {
             const int r = blockIdx.x * blockDim.x + threadIdx.x;
             const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -165,7 +163,7 @@ namespace cv { namespace cuda { namespace device
                 const float radius = (r - (numrho - 1) * 0.5f) * rho;
                 const float angle = n * theta;
 
-                const int ind = ::atomicAdd(&g_counter, 1);
+                const int ind = ::atomicAdd(counterPtr, 1);
                 if (ind < maxSize)
                 {
                     out[ind] = make_float2(radius, angle);
@@ -174,25 +172,22 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
+        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
 
             cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
 
-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
+            linesGetResult<<<grid, block, 0, stream>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             totalCount = ::min(totalCount, maxSize);
 
@@ -200,7 +195,7 @@ namespace cv { namespace cuda { namespace device
             {
                 thrust::device_ptr<float2> outPtr(out);
                 thrust::device_ptr<int> votesPtr(votes);
-                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
+                thrust::sort_by_key(thrust::cuda::par.on(stream), votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
             }
 
             return totalCount;
diff --git a/modules/cudaimgproc/src/cuda/hough_segments.cu b/modules/cudaimgproc/src/cuda/hough_segments.cu
index ca433d30d..988f14c1d 100644
--- a/modules/cudaimgproc/src/cuda/hough_segments.cu
+++ b/modules/cudaimgproc/src/cuda/hough_segments.cu
@@ -49,15 +49,14 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_segments
     {
-        __device__ int g_counter;
-
         texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
 
         __global__ void houghLinesProbabilistic(const PtrStepSzi accum,
                                                 int4* out, const int maxSize,
                                                 const float rho, const float theta,
                                                 const int lineGap, const int lineLength,
-                                                const int rows, const int cols)
+                                                const int rows, const int cols,
+                                                int* counterPtr)
         {
             const int r = blockIdx.x * blockDim.x + threadIdx.x;
             const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -182,7 +181,7 @@ namespace cv { namespace cuda { namespace device
 
                             if (good_line)
                             {
-                                const int ind = ::atomicAdd(&g_counter, 1);
+                                const int ind = ::atomicAdd(counterPtr, 1);
                                 if (ind < maxSize)
                                     out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
                             }
@@ -202,7 +201,7 @@ namespace cv { namespace cuda { namespace device
 
                             if (good_line)
                             {
-                                const int ind = ::atomicAdd(&g_counter, 1);
+                                const int ind = ::atomicAdd(counterPtr, 1);
                                 if (ind < maxSize)
                                     out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
                             }
@@ -214,29 +213,27 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
+        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
 
             bindTexture(&tex_mask, mask);
 
-            houghLinesProbabilistic<<<grid, block>>>(accum,
+            houghLinesProbabilistic<<<grid, block, 0, stream>>>(accum,
                                                      out, maxSize,
                                                      rho, theta,
                                                      lineGap, lineLength,
-                                                     mask.rows, mask.cols);
+                                                     mask.rows, mask.cols,
+                                                     counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             totalCount = ::min(totalCount, maxSize);
 
diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp
index 0fa962d71..61d49d46b 100644
--- a/modules/cudaimgproc/src/hough_circles.cpp
+++ b/modules/cudaimgproc/src/hough_circles.cpp
@@ -55,15 +55,15 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough
     {
-        int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
     }
 
     namespace hough_circles
     {
-        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp);
-        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold);
+        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream);
+        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream);
         int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
-                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20);
+                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream);
     }
 }}}
 
@@ -73,6 +73,7 @@ namespace
     {
     public:
         HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles);
+        ~HoughCirclesDetectorImpl();
 
         void detect(InputArray src, OutputArray circles, Stream& stream);
 
@@ -140,6 +141,8 @@ namespace
         Ptr<cuda::Filter> filterDx_;
         Ptr<cuda::Filter> filterDy_;
         Ptr<cuda::CannyEdgeDetector> canny_;
+
+        int* counterPtr_;
     };
 
     bool centersCompare(Vec3f a, Vec3f b) {return (a[2] > b[2]);}
@@ -153,16 +156,22 @@ namespace
 
         filterDx_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 1, 0);
         filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
+
+        cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
     }
 
-    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
+    HoughCirclesDetectorImpl::~HoughCirclesDetectorImpl()
     {
-        // TODO : implement async version
-        CV_UNUSED(stream);
+        cudaSafeCall(cudaFree(counterPtr_));
+    }
 
+    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
+    {
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_circles;
 
+        auto cudaStream = StreamAccessor::getStream(stream);
+
         GpuMat src = _src.getGpuMat();
 
         CV_Assert( src.type() == CV_8UC1 );
@@ -182,13 +191,13 @@ namespace
         canny_->setLowThreshold(std::max(cannyThreshold_ / 2, 1));
         canny_->setHighThreshold(cannyThreshold_);
 
-        canny_->detect(dx_, dy_, edges_);
+        canny_->detect(dx_, dy_, edges_, stream);
 
         ensureSizeIsEnough(2, src.size().area(), CV_32SC1, list_);
         unsigned int* srcPoints = list_.ptr<unsigned int>(0);
         unsigned int* centers = list_.ptr<unsigned int>(1);
 
-        const int pointsCount = buildPointList_gpu(edges_, srcPoints);
+        const int pointsCount = buildPointList_gpu(edges_, srcPoints, counterPtr_, cudaStream);
         if (pointsCount == 0)
         {
             circles.release();
@@ -196,13 +205,13 @@ namespace
         }
 
         ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, accum_);
-        accum_.setTo(Scalar::all(0));
+        accum_.setTo(Scalar::all(0), stream);
 
-        circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp);
+        circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp, cudaStream);
 
-        accum_.download(tt);
+        accum_.download(tt, stream);
 
-        int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_);
+        int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_, counterPtr_, cudaStream);
         if (centersCount == 0)
         {
             circles.release();
@@ -218,7 +227,8 @@ namespace
             ushort2* oldBuf = oldBuf_.data();
             ushort2* newBuf = newBuf_.data();
 
-            cudaSafeCall( cudaMemcpy(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost, cudaStream) );
+            cudaSafeCall( cudaStreamSynchronize(cudaStream) );
 
             const int cellSize = cvRound(minDist_);
             const int gridWidth = (src.cols + cellSize - 1) / cellSize;
@@ -290,14 +300,15 @@ namespace
                 }
             }
 
-            cudaSafeCall( cudaMemcpy(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice) );
+            cudaSafeCall( cudaMemcpyAsync(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice, cudaStream) );
             centersCount = newCount;
         }
 
         ensureSizeIsEnough(1, maxCircles_, CV_32FC3, result_);
 
         int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, result_.ptr<float3>(), maxCircles_,
-                                                  dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20));
+                                                  dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20),
+                                                  counterPtr_, cudaStream);
 
         if (circlesCount == 0)
         {
@@ -306,7 +317,7 @@ namespace
         }
 
         result_.cols = circlesCount;
-        result_.copyTo(circles);
+        result_.copyTo(circles, stream);
     }
 }
 
diff --git a/modules/cudaimgproc/src/hough_lines.cpp b/modules/cudaimgproc/src/hough_lines.cpp
index e112e09a3..06de8c0be 100644
--- a/modules/cudaimgproc/src/hough_lines.cpp
+++ b/modules/cudaimgproc/src/hough_lines.cpp
@@ -55,13 +55,13 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough
     {
-        int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
     }
 
     namespace hough_lines
     {
-        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
-        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort);
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream);
+        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream);
     }
 }}}
 
@@ -70,10 +70,8 @@ namespace
     class HoughLinesDetectorImpl : public HoughLinesDetector
     {
     public:
-        HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) :
-            rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines)
-        {
-        }
+        HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines);
+        ~HoughLinesDetectorImpl();
 
         void detect(InputArray src, OutputArray lines, Stream& stream);
         void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream);
@@ -124,16 +122,27 @@ namespace
         GpuMat accum_;
         GpuMat list_;
         GpuMat result_;
+
+        int* counterPtr_;
     };
 
-    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
+    HoughLinesDetectorImpl::HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) :
+        rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines)
+    {
+        cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
+    }
+
+    HoughLinesDetectorImpl::~HoughLinesDetectorImpl()
     {
-        // TODO : implement async version
-        CV_UNUSED(stream);
+        cudaSafeCall(cudaFree(counterPtr_));
+    }
 
+    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
+    {
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_lines;
 
+        auto cudaStream = StreamAccessor::getStream(stream);
         GpuMat src = _src.getGpuMat();
 
         CV_Assert( src.type() == CV_8UC1 );
@@ -143,7 +152,7 @@ namespace
         ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
         unsigned int* srcPoints = list_.ptr<unsigned int>();
 
-        const int pointsCount = buildPointList_gpu(src, srcPoints);
+        const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream);
         if (pointsCount == 0)
         {
             lines.release();
@@ -155,14 +164,14 @@ namespace
         CV_Assert( numangle > 0 && numrho > 0 );
 
         ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
-        accum_.setTo(Scalar::all(0));
+        accum_.setTo(Scalar::all(0), stream);
 
         DeviceInfo devInfo;
-        linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
+        linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream);
 
         ensureSizeIsEnough(2, maxLines_, CV_32FC2, result_);
 
-        int linesCount = linesGetResult_gpu(accum_, result_.ptr<float2>(0), result_.ptr<int>(1), maxLines_, rho_, theta_, threshold_, doSort_);
+        int linesCount = linesGetResult_gpu(accum_, result_.ptr<float2>(0), result_.ptr<int>(1), maxLines_, rho_, theta_, threshold_, doSort_, counterPtr_, cudaStream);
 
         if (linesCount == 0)
         {
@@ -171,7 +180,7 @@ namespace
         }
 
         result_.cols = linesCount;
-        result_.copyTo(lines);
+        result_.copyTo(lines, stream);
     }
 
     void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream)
diff --git a/modules/cudaimgproc/src/hough_segments.cpp b/modules/cudaimgproc/src/hough_segments.cpp
index 34ee47446..f0eb0beec 100644
--- a/modules/cudaimgproc/src/hough_segments.cpp
+++ b/modules/cudaimgproc/src/hough_segments.cpp
@@ -55,17 +55,17 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough
     {
-        int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
     }
 
     namespace hough_lines
     {
-        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream);
     }
 
     namespace hough_segments
     {
-        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength);
+        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream);
     }
 }}}
 
@@ -74,10 +74,8 @@ namespace
     class HoughSegmentDetectorImpl : public HoughSegmentDetector
     {
     public:
-        HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) :
-            rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines)
-        {
-        }
+        HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines);
+        ~HoughSegmentDetectorImpl();
 
         void detect(InputArray src, OutputArray lines, Stream& stream);
 
@@ -127,8 +125,21 @@ namespace
         GpuMat accum_;
         GpuMat list_;
         GpuMat result_;
+
+        int* counterPtr_;
     };
 
+    HoughSegmentDetectorImpl::HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) :
+        rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines)
+    {
+        cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
+    }
+
+    HoughSegmentDetectorImpl::~HoughSegmentDetectorImpl()
+    {
+        cudaSafeCall(cudaFree(counterPtr_));
+    }
+
     void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
     {
         // TODO : implement async version
@@ -138,6 +149,7 @@ namespace
         using namespace cv::cuda::device::hough_lines;
         using namespace cv::cuda::device::hough_segments;
 
+        auto cudaStream = StreamAccessor::getStream(stream);
         GpuMat src = _src.getGpuMat();
 
         CV_Assert( src.type() == CV_8UC1 );
@@ -147,7 +159,7 @@ namespace
         ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
         unsigned int* srcPoints = list_.ptr<unsigned int>();
 
-        const int pointsCount = buildPointList_gpu(src, srcPoints);
+        const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream);
         if (pointsCount == 0)
         {
             lines.release();
@@ -159,14 +171,14 @@ namespace
         CV_Assert( numangle > 0 && numrho > 0 );
 
         ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
-        accum_.setTo(Scalar::all(0));
+        accum_.setTo(Scalar::all(0), stream);
 
         DeviceInfo devInfo;
-        linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
+        linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream);
 
         ensureSizeIsEnough(1, maxLines_, CV_32SC4, result_);
 
-        int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr<int4>(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_);
+        int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr<int4>(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_, counterPtr_, cudaStream);
 
         if (linesCount == 0)
         {
@@ -175,7 +187,7 @@ namespace
         }
 
         result_.cols = linesCount;
-        result_.copyTo(lines);
+        result_.copyTo(lines, stream);
     }
 }