Merge pull request #2801 from Atlas42:cuda-hough-stream-fix

Added stream support for Hough circles, lines, and segments

* Added stream support for Hough circles, lines, and segments
- Passed the stream to the relevant CUDA, OpenCV, and Thrust library calls
- Replaced the cudaDeviceSynchronize calls with cudaStreamSynchronize
- Added extra synchronize calls after device-to-host transfers
- Replaced the __device__ global counters with cudaMalloc-allocated values (the overall pattern is sketched below, after the commit message)

* Fixed missing include for CUDA 8

Co-authored-by: william.fink <will@recurse.io>
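
The core of the change, condensed into a minimal standalone sketch (illustrative kernel and function names, not the actual OpenCV code): the per-file __device__ int g_counter symbol is replaced by a caller-owned counter allocated with cudaMalloc, and the memset, kernel launch, and device-to-host copy are all issued on the caller's stream, followed by a single cudaStreamSynchronize before the count is read on the host.

// Minimal sketch of the counter/stream pattern applied throughout this PR
// (hypothetical names; cudaSafeCall-style error checking omitted).
#include <cuda_runtime.h>
#include <cstdio>

__global__ void countPositive(const int* data, int n, int* counterPtr)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && data[i] > 0)
        ::atomicAdd(counterPtr, 1);              // was: atomicAdd(&g_counter, 1)
}

int countPositive_gpu(const int* data, int n, int* counterPtr, cudaStream_t stream)
{
    // Reset the externally allocated counter on the caller's stream
    // (was: cudaGetSymbolAddress + blocking cudaMemset on the default stream).
    cudaMemsetAsync(counterPtr, 0, sizeof(int), stream);

    const int block = 256;
    const int grid  = (n + block - 1) / block;
    countPositive<<<grid, block, 0, stream>>>(data, n, counterPtr);

    // Read the result back asynchronously and synchronize only this stream
    // (was: cudaDeviceSynchronize + blocking cudaMemcpy).
    int totalCount = 0;
    cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return totalCount;
}

int main()
{
    const int n = 1 << 20;
    int *d_data = NULL, *d_counter = NULL;
    cudaMalloc(&d_data, n * sizeof(int));
    cudaMalloc(&d_counter, sizeof(int));         // replaces the __device__ global
    cudaMemset(d_data, 1, n * sizeof(int));      // every int becomes 0x01010101 > 0

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    printf("positive elements: %d\n", countPositive_gpu(d_data, n, d_counter, stream));

    cudaStreamDestroy(stream);
    cudaFree(d_counter);
    cudaFree(d_data);
    return 0;
}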
Atlas42 committed via GitHub
parent 6d5f440402
commit 33ae078b09
  1. modules/cudaimgproc/src/cuda/build_point_list.cu (20 changed lines)
  2. modules/cudaimgproc/src/cuda/hough_circles.cu (46 changed lines)
  3. modules/cudaimgproc/src/cuda/hough_lines.cu (33 changed lines)
  4. modules/cudaimgproc/src/cuda/hough_segments.cu (27 changed lines)
  5. modules/cudaimgproc/src/hough_circles.cpp (45 changed lines)
  6. modules/cudaimgproc/src/hough_lines.cpp (39 changed lines)
  7. modules/cudaimgproc/src/hough_segments.cpp (36 changed lines)
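
The three .cpp files above all follow the same host-side shape, condensed here into a hypothetical detector class (a sketch only, not the actual OpenCV implementation): the counter is allocated once in the constructor and freed in the destructor, the raw cudaStream_t is obtained with StreamAccessor::getStream, and the GpuMat operations use their stream-aware overloads.

// Condensed sketch of the host-side pattern shared by the detector
// implementations in the diff below (hypothetical class; error handling trimmed).
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <cuda_runtime.h>

class StreamAwareDetector
{
public:
    StreamAwareDetector()  { cudaMalloc(&counterPtr_, sizeof(int)); }  // replaces __device__ int g_counter
    ~StreamAwareDetector() { cudaFree(counterPtr_); }

    void detect(cv::InputArray _src, cv::OutputArray dst, cv::cuda::Stream& stream)
    {
        // Extract the raw stream once and forward it to every CUDA helper call.
        cudaStream_t cudaStream = cv::cuda::StreamAccessor::getStream(stream);
        CV_UNUSED(cudaStream);  // the real detectors pass it to the *_gpu helpers

        cv::cuda::GpuMat src = _src.getGpuMat();
        CV_Assert(src.type() == CV_8UC1);

        accum_.create(src.size(), CV_32SC1);
        accum_.setTo(cv::Scalar::all(0), stream);   // stream-aware GpuMat overload

        // ... the *_gpu helpers would run here with (..., counterPtr_, cudaStream) ...

        accum_.copyTo(dst, stream);                 // asynchronous copy of the result
    }

private:
    cv::cuda::GpuMat accum_;
    int* counterPtr_ = nullptr;
};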

@@ -49,10 +49,8 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
__device__ int g_counter;
template <int PIXELS_PER_THREAD>
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list, int* counterPtr)
{
__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ int s_qsize[4];
@@ -94,7 +92,7 @@ namespace cv { namespace cuda { namespace device
}
// calculate the offset in the global list
const int globalOffset = atomicAdd(&g_counter, totalSize);
const int globalOffset = atomicAdd(counterPtr, totalSize);
for (int i = 0; i < blockDim.y; ++i)
s_globStart[i] += globalOffset;
}
@@ -108,27 +106,23 @@ namespace cv { namespace cuda { namespace device
list[gidx] = s_queues[threadIdx.y][i];
}
int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream)
{
const int PIXELS_PER_THREAD = 16;
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
buildPointList<PIXELS_PER_THREAD><<<grid, block, 0, stream>>>(src, list, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
return totalCount;
}

@@ -54,8 +54,6 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_circles
{
__device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// circlesAccumCenters
@@ -111,23 +109,22 @@ namespace cv { namespace cuda { namespace device
}
}
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(count, block.x));
cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
circlesAccumCenters<<<grid, block, 0, stream>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaStreamSynchronize(stream) );
}
////////////////////////////////////////////////////////////////////////
// buildCentersList
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold, int* counterPtr)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -145,31 +142,27 @@ namespace cv { namespace cuda { namespace device
if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
{
const unsigned int val = (y << 16) | x;
const int idx = ::atomicAdd(&g_counter, 1);
const int idx = ::atomicAdd(counterPtr, 1);
centers[idx] = val;
}
}
}
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
buildCentersList<<<grid, block>>>(accum, centers, threshold);
buildCentersList<<<grid, block, 0, stream>>>(accum, centers, threshold, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
return totalCount;
}
@@ -179,7 +172,8 @@ namespace cv { namespace cuda { namespace device
__global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
float3* circles, const int maxCircles, const float dp,
const int minRadius, const int maxRadius, const int histSize, const int threshold)
const int minRadius, const int maxRadius, const int histSize, const int threshold,
int* counterPtr)
{
int* smem = DynamicSharedMem<int>();
@@ -219,7 +213,7 @@ namespace cv { namespace cuda { namespace device
if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
{
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxCircles)
circles[ind] = make_float3(cx, cy, i + minRadius);
}
@@ -227,12 +221,9 @@ namespace cv { namespace cuda { namespace device
}
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(centersCount);
@@ -240,13 +231,12 @@ namespace cv { namespace cuda { namespace device
const int histSize = maxRadius - minRadius + 1;
size_t smemSize = (histSize + 2) * sizeof(int);
circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
circlesAccumRadius<<<grid, block, smemSize, stream>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxCircles);

@@ -44,6 +44,7 @@
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
@@ -53,8 +54,6 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_lines
{
__device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// linesAccum
@@ -126,7 +125,7 @@ namespace cv { namespace cuda { namespace device
accumRow[i] = smem[i];
}
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream)
{
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(accum.rows - 2);
@@ -134,19 +133,18 @@ namespace cv { namespace cuda { namespace device
size_t smemSize = (accum.cols - 1) * sizeof(int);
if (smemSize < sharedMemPerBlock - 1000)
linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
linesAccumShared<<<grid, block, smemSize, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
else
linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
linesAccumGlobal<<<grid, block, 0, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaStreamSynchronize(stream) );
}
////////////////////////////////////////////////////////////////////////
// linesGetResult
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho, int* counterPtr)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -165,7 +163,7 @@ namespace cv { namespace cuda { namespace device
const float radius = (r - (numrho - 1) * 0.5f) * rho;
const float angle = n * theta;
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxSize)
{
out[ind] = make_float2(radius, angle);
@@ -174,25 +172,22 @@ namespace cv { namespace cuda { namespace device
}
}
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
linesGetResult<<<grid, block, 0, stream>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2, counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxSize);
@@ -200,7 +195,7 @@ namespace cv { namespace cuda { namespace device
{
thrust::device_ptr<float2> outPtr(out);
thrust::device_ptr<int> votesPtr(votes);
thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
thrust::sort_by_key(thrust::cuda::par.on(stream), votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
}
return totalCount;
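
The thrust::cuda::par.on(stream) execution policy above is what the newly added thrust/system/cuda/execution_policy.h include provides; without it, sort_by_key would run on the default stream. A standalone illustration of the same call (hypothetical data, not the OpenCV vote/line buffers):

// Sorting key/value pairs on a specific CUDA stream with Thrust,
// mirroring the sort_by_key call in linesGetResult_gpu (illustrative data only).
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/system/cuda/execution_policy.h>

int main()
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Votes (keys) and line parameters (values), as in the Hough result buffers.
    thrust::device_vector<int>    votes(4);
    thrust::device_vector<float2> out(4);
    votes[0] = 10; votes[1] = 40; votes[2] = 20; votes[3] = 30;
    out[0] = make_float2(0.f, 0.f); out[1] = make_float2(1.f, 0.f);
    out[2] = make_float2(2.f, 0.f); out[3] = make_float2(3.f, 0.f);

    // par.on(stream) binds the sort to the caller's stream instead of the default one.
    thrust::sort_by_key(thrust::cuda::par.on(stream),
                        votes.begin(), votes.end(), out.begin(),
                        thrust::greater<int>());

    cudaStreamSynchronize(stream);
    printf("top vote count: %d\n", (int)votes[0]);   // prints 40

    cudaStreamDestroy(stream);
    return 0;
}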

@@ -49,15 +49,14 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_segments
{
__device__ int g_counter;
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void houghLinesProbabilistic(const PtrStepSzi accum,
int4* out, const int maxSize,
const float rho, const float theta,
const int lineGap, const int lineLength,
const int rows, const int cols)
const int rows, const int cols,
int* counterPtr)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -182,7 +181,7 @@ namespace cv { namespace cuda { namespace device
if (good_line)
{
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxSize)
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
}
@@ -202,7 +201,7 @@ namespace cv { namespace cuda { namespace device
if (good_line)
{
const int ind = ::atomicAdd(&g_counter, 1);
const int ind = ::atomicAdd(counterPtr, 1);
if (ind < maxSize)
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
}
@@ -214,29 +213,27 @@ namespace cv { namespace cuda { namespace device
}
}
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
bindTexture(&tex_mask, mask);
houghLinesProbabilistic<<<grid, block>>>(accum,
houghLinesProbabilistic<<<grid, block, 0, stream>>>(accum,
out, maxSize,
rho, theta,
lineGap, lineLength,
mask.rows, mask.cols);
mask.rows, mask.cols,
counterPtr);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxSize);

@@ -55,15 +55,15 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
}
namespace hough_circles
{
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp);
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold);
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream);
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream);
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20);
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream);
}
}}}
@@ -73,6 +73,7 @@ namespace
{
public:
HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles);
~HoughCirclesDetectorImpl();
void detect(InputArray src, OutputArray circles, Stream& stream);
@@ -140,6 +141,8 @@ namespace
Ptr<cuda::Filter> filterDx_;
Ptr<cuda::Filter> filterDy_;
Ptr<cuda::CannyEdgeDetector> canny_;
int* counterPtr_;
};
bool centersCompare(Vec3f a, Vec3f b) {return (a[2] > b[2]);}
@@ -153,16 +156,22 @@ namespace
filterDx_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 1, 0);
filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
}
void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
HoughCirclesDetectorImpl::~HoughCirclesDetectorImpl()
{
// TODO : implement async version
CV_UNUSED(stream);
cudaSafeCall(cudaFree(counterPtr_));
}
void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
{
using namespace cv::cuda::device::hough;
using namespace cv::cuda::device::hough_circles;
auto cudaStream = StreamAccessor::getStream(stream);
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
@@ -182,13 +191,13 @@ namespace
canny_->setLowThreshold(std::max(cannyThreshold_ / 2, 1));
canny_->setHighThreshold(cannyThreshold_);
canny_->detect(dx_, dy_, edges_);
canny_->detect(dx_, dy_, edges_, stream);
ensureSizeIsEnough(2, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>(0);
unsigned int* centers = list_.ptr<unsigned int>(1);
const int pointsCount = buildPointList_gpu(edges_, srcPoints);
const int pointsCount = buildPointList_gpu(edges_, srcPoints, counterPtr_, cudaStream);
if (pointsCount == 0)
{
circles.release();
@@ -196,13 +205,13 @@ namespace
}
ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
accum_.setTo(Scalar::all(0), stream);
circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp);
circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp, cudaStream);
accum_.download(tt);
accum_.download(tt, stream);
int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_);
int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_, counterPtr_, cudaStream);
if (centersCount == 0)
{
circles.release();
@@ -218,7 +227,8 @@ namespace
ushort2* oldBuf = oldBuf_.data();
ushort2* newBuf = newBuf_.data();
cudaSafeCall( cudaMemcpy(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost, cudaStream) );
cudaSafeCall( cudaStreamSynchronize(cudaStream) );
const int cellSize = cvRound(minDist_);
const int gridWidth = (src.cols + cellSize - 1) / cellSize;
@@ -290,14 +300,15 @@ namespace
}
}
cudaSafeCall( cudaMemcpy(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice) );
cudaSafeCall( cudaMemcpyAsync(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice, cudaStream) );
centersCount = newCount;
}
ensureSizeIsEnough(1, maxCircles_, CV_32FC3, result_);
int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, result_.ptr<float3>(), maxCircles_,
dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20));
dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20),
counterPtr_, cudaStream);
if (circlesCount == 0)
{
@@ -306,7 +317,7 @@ namespace
}
result_.cols = circlesCount;
result_.copyTo(circles);
result_.copyTo(circles, stream);
}
}

@@ -55,13 +55,13 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
}
namespace hough_lines
{
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort);
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream);
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream);
}
}}}
@@ -70,10 +70,8 @@ namespace
class HoughLinesDetectorImpl : public HoughLinesDetector
{
public:
HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) :
rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines)
{
}
HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines);
~HoughLinesDetectorImpl();
void detect(InputArray src, OutputArray lines, Stream& stream);
void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream);
@@ -124,16 +122,27 @@ namespace
GpuMat accum_;
GpuMat list_;
GpuMat result_;
int* counterPtr_;
};
void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
HoughLinesDetectorImpl::HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) :
rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines)
{
cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
}
HoughLinesDetectorImpl::~HoughLinesDetectorImpl()
{
// TODO : implement async version
CV_UNUSED(stream);
cudaSafeCall(cudaFree(counterPtr_));
}
void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
{
using namespace cv::cuda::device::hough;
using namespace cv::cuda::device::hough_lines;
auto cudaStream = StreamAccessor::getStream(stream);
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
@@ -143,7 +152,7 @@ namespace
ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>();
const int pointsCount = buildPointList_gpu(src, srcPoints);
const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream);
if (pointsCount == 0)
{
lines.release();
@@ -155,14 +164,14 @@ namespace
CV_Assert( numangle > 0 && numrho > 0 );
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
accum_.setTo(Scalar::all(0), stream);
DeviceInfo devInfo;
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream);
ensureSizeIsEnough(2, maxLines_, CV_32FC2, result_);
int linesCount = linesGetResult_gpu(accum_, result_.ptr<float2>(0), result_.ptr<int>(1), maxLines_, rho_, theta_, threshold_, doSort_);
int linesCount = linesGetResult_gpu(accum_, result_.ptr<float2>(0), result_.ptr<int>(1), maxLines_, rho_, theta_, threshold_, doSort_, counterPtr_, cudaStream);
if (linesCount == 0)
{
@@ -171,7 +180,7 @@ namespace
}
result_.cols = linesCount;
result_.copyTo(lines);
result_.copyTo(lines, stream);
}
void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream)

@@ -55,17 +55,17 @@ namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream);
}
namespace hough_lines
{
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream);
}
namespace hough_segments
{
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength);
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream);
}
}}}
@@ -74,10 +74,8 @@ namespace
class HoughSegmentDetectorImpl : public HoughSegmentDetector
{
public:
HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) :
rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines)
{
}
HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines);
~HoughSegmentDetectorImpl();
void detect(InputArray src, OutputArray lines, Stream& stream);
@@ -127,8 +125,21 @@ namespace
GpuMat accum_;
GpuMat list_;
GpuMat result_;
int* counterPtr_;
};
HoughSegmentDetectorImpl::HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) :
rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines)
{
cudaSafeCall(cudaMalloc(&counterPtr_, sizeof(int)));
}
HoughSegmentDetectorImpl::~HoughSegmentDetectorImpl()
{
cudaSafeCall(cudaFree(counterPtr_));
}
void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
{
// TODO : implement async version
@@ -138,6 +149,7 @@ namespace
using namespace cv::cuda::device::hough_lines;
using namespace cv::cuda::device::hough_segments;
auto cudaStream = StreamAccessor::getStream(stream);
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
@@ -147,7 +159,7 @@ namespace
ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>();
const int pointsCount = buildPointList_gpu(src, srcPoints);
const int pointsCount = buildPointList_gpu(src, srcPoints, counterPtr_, cudaStream);
if (pointsCount == 0)
{
lines.release();
@@ -159,14 +171,14 @@ namespace
CV_Assert( numangle > 0 && numrho > 0 );
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
accum_.setTo(Scalar::all(0), stream);
DeviceInfo devInfo;
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20), cudaStream);
ensureSizeIsEnough(1, maxLines_, CV_32SC4, result_);
int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr<int4>(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_);
int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr<int4>(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_, counterPtr_, cudaStream);
if (linesCount == 0)
{
@@ -175,7 +187,7 @@ namespace
}
result_.cols = linesCount;
result_.copyTo(lines);
result_.copyTo(lines, stream);
}
}
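
With these changes the public detectors can run end to end on a user-supplied cv::cuda::Stream. A small usage sketch (hypothetical parameter values; assumes OpenCV built with the cudaimgproc module):

// Hypothetical end-to-end usage of the stream support added in this PR.
#include <cstdio>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

int main()
{
    // Build a simple binary test image on the host and upload it.
    cv::Mat h_src(480, 640, CV_8UC1, cv::Scalar::all(0));
    cv::line(h_src, cv::Point(50, 50), cv::Point(600, 400), cv::Scalar::all(255), 2);

    cv::cuda::Stream stream;                      // non-default stream
    cv::cuda::GpuMat d_src, d_lines;
    d_src.upload(h_src, stream);

    // detect() now issues its kernels, memsets, and copies on `stream`.
    cv::Ptr<cv::cuda::HoughSegmentDetector> hough =
        cv::cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0), 50, 5);
    hough->detect(d_src, d_lines, stream);

    std::vector<cv::Vec4i> lines;
    if (!d_lines.empty())
    {
        lines.resize(d_lines.cols);
        cv::Mat h_lines(1, d_lines.cols, CV_32SC4, &lines[0]);
        d_lines.download(h_lines, stream);
    }
    stream.waitForCompletion();                   // results are valid after this point
    printf("detected %d segment(s)\n", (int)lines.size());
    return 0;
}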
