From 79ecefb51fc374fbd813cc1d44c9d3d3f579d69d Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Wed, 9 Dec 2015 11:05:07 -0500
Subject: [PATCH 1/4] Implemented async calls.

---
 modules/cudafeatures2d/src/cuda/orb.cu | 14 ++++--
 modules/cudafeatures2d/src/orb.cpp     | 66 +++++++++++++-------------
 2 files changed, 43 insertions(+), 37 deletions(-)
diff --git a/modules/cudafeatures2d/src/cuda/orb.cu b/modules/cudafeatures2d/src/cuda/orb.cu
index 2e4f2e0068..a84c601fb5 100644
--- a/modules/cudafeatures2d/src/cuda/orb.cu
+++ b/modules/cudafeatures2d/src/cuda/orb.cu
@@ -44,6 +44,8 @@
 
 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>
+#include <thrust/system/cuda/execution_policy.h>
+
 
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/reduce.hpp"
@@ -56,13 +58,17 @@ namespace cv { namespace cuda { namespace device
         ////////////////////////////////////////////////////////////////////////////////////////////////////////
         // cull
 
-        int cull_gpu(int* loc, float* response, int size, int n_points)
+        int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream)
         {
             thrust::device_ptr<int> loc_ptr(loc);
             thrust::device_ptr<float> response_ptr(response);
-
-            thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
-
+            if(stream)
+            {
+                thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+            }else
+            {
+                thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+            }
             return n_points;
         }
 
diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp
index 6bfdd5ac47..deb5ccf53a 100644
--- a/modules/cudafeatures2d/src/orb.cpp
+++ b/modules/cudafeatures2d/src/orb.cpp
@@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device
 {
     namespace orb
     {
-        int cull_gpu(int* loc, float* response, int size, int n_points);
+        int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream);
 
         void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream);
 
@@ -401,10 +401,10 @@ namespace
         bool blurForDescriptor_;
 
     private:
-        void buildScalePyramids(InputArray _image, InputArray _mask);
-        void computeKeyPointsPyramid();
-        void computeDescriptors(OutputArray _descriptors);
-        void mergeKeyPoints(OutputArray _keypoints);
+        void buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream);
+        void computeKeyPointsPyramid(Stream& stream);
+        void computeDescriptors(OutputArray _descriptors, Stream& stream);
+        void mergeKeyPoints(OutputArray _keypoints, Stream& stream);
 
     private:
         Ptr<cv::cuda::FastFeatureDetector> fastDetector_;
@@ -582,13 +582,13 @@ namespace
     {
         CV_Assert( useProvidedKeypoints == false );
 
-        buildScalePyramids(_image, _mask);
-        computeKeyPointsPyramid();
+        buildScalePyramids(_image, _mask, stream);
+        computeKeyPointsPyramid(stream);
         if (_descriptors.needed())
         {
-            computeDescriptors(_descriptors);
+            computeDescriptors(_descriptors, stream);
         }
-        mergeKeyPoints(_keypoints);
+        mergeKeyPoints(_keypoints, stream);
     }
 
     static float getScale(float scaleFactor, int firstLevel, int level)
@@ -596,7 +596,7 @@ namespace
         return pow(scaleFactor, level - firstLevel);
     }
 
-    void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask)
+    void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream)
     {
         const GpuMat image = _image.getGpuMat();
         const GpuMat mask = _mask.getGpuMat();
@@ -622,28 +622,28 @@ namespace
             {
                 if (level < firstLevel_)
                 {
-                    cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);
+                    cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream);
 
                     if (!mask.empty())
-                        cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
+                        cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream);
                 }
                 else
                 {
-                    cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);
+                    cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream);
 
                     if (!mask.empty())
                     {
-                        cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
-                        cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
+                        cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream);
+                        cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO, stream);
                     }
                 }
             }
             else
             {
-                image.copyTo(imagePyr_[level]);
+                image.copyTo(imagePyr_[level], stream);
 
                 if (!mask.empty())
-                    mask.copyTo(maskPyr_[level]);
+                    mask.copyTo(maskPyr_[level], stream);
             }
 
             // Filter keypoints by image border
@@ -652,12 +652,12 @@ namespace
             Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
             buf_(inner).setTo(Scalar::all(255));
 
-            cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
+            cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], stream);
         }
     }
 
     // takes keypoints and culls them by the response
-    static void cull(GpuMat& keypoints, int& count, int n_points)
+    static void cull(GpuMat& keypoints, int& count, int n_points, Stream& stream)
     {
         using namespace cv::cuda::device::orb;
 
@@ -670,11 +670,11 @@ namespace
                 return;
             }
 
-            count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points);
+            count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points, StreamAccessor::getStream(stream));
         }
     }
 
-    void ORB_Impl::computeKeyPointsPyramid()
+    void ORB_Impl::computeKeyPointsPyramid(Stream& stream)
     {
         using namespace cv::cuda::device::orb;
 
@@ -690,7 +690,7 @@ namespace
             fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area());
 
             GpuMat fastKpRange;
-            fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null());
+            fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], stream);
 
             keyPointsCount_[level] = fastKpRange.cols;
 
@@ -698,28 +698,28 @@ namespace
                 continue;
 
             ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]);
-            fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2));
+            fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2), stream);
 
             const int n_features = static_cast<int>(n_features_per_level_[level]);
 
             if (scoreType_ == ORB::HARRIS_SCORE)
             {
                 // Keep more points than necessary as FAST does not give amazing corners
-                cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
+                cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features, stream);
 
                 // Compute the Harris cornerness (better scoring than FAST)
-                HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
+                HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, StreamAccessor::getStream(stream));
             }
 
             //cull to the final desired level, using the new Harris scores or the original FAST scores.
-            cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
+            cull(keyPointsPyr_[level], keyPointsCount_[level], n_features, stream);
 
             // Compute orientation
-            IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
+            IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, StreamAccessor::getStream(stream));
         }
     }
 
-    void ORB_Impl::computeDescriptors(OutputArray _descriptors)
+    void ORB_Impl::computeDescriptors(OutputArray _descriptors, Stream& stream)
     {
         using namespace cv::cuda::device::orb;
 
@@ -750,17 +750,17 @@ namespace
             {
                 // preprocess the resized image
                 ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-                blurFilter_->apply(imagePyr_[level], buf_);
+                blurFilter_->apply(imagePyr_[level], buf_, stream);
             }
 
             computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
-                keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);
+                keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, StreamAccessor::getStream(stream));
 
             offset += keyPointsCount_[level];
         }
     }
 
-    void ORB_Impl::mergeKeyPoints(OutputArray _keypoints)
+    void ORB_Impl::mergeKeyPoints(OutputArray _keypoints, Stream& stream)
     {
         using namespace cv::cuda::device::orb;
 
@@ -791,10 +791,10 @@ namespace
 
             float locScale = level != firstLevel_ ? sf : 1.0f;
 
-            mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
+            mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, StreamAccessor::getStream(stream));
 
             GpuMat range = keyPointsRange.rowRange(2, 4);
-            keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range);
+            keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range, stream);
 
             keyPointsRange.row(4).setTo(Scalar::all(level));
             keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));

From a8ca5606845d8b0c5b764004f322ae4e73434d86 Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Wed, 9 Dec 2015 11:08:17 -0500
Subject: [PATCH 2/4] Fixed more missing async calls.

---
 modules/cudafeatures2d/src/orb.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp
index deb5ccf53a..ec1525374a 100644
--- a/modules/cudafeatures2d/src/orb.cpp
+++ b/modules/cudafeatures2d/src/orb.cpp
@@ -648,9 +648,9 @@ namespace
 
             // Filter keypoints by image border
             ensureSizeIsEnough(sz, CV_8UC1, buf_);
-            buf_.setTo(Scalar::all(0));
+            buf_.setTo(Scalar::all(0), stream);
             Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
-            buf_(inner).setTo(Scalar::all(255));
+            buf_(inner).setTo(Scalar::all(255), stream);
 
             cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], stream);
         }
@@ -796,8 +796,8 @@ namespace
             GpuMat range = keyPointsRange.rowRange(2, 4);
             keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range, stream);
 
-            keyPointsRange.row(4).setTo(Scalar::all(level));
-            keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));
+            keyPointsRange.row(4).setTo(Scalar::all(level), stream);
+            keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf), stream);
 
             offset += keyPointsCount_[level];
         }

From 7a934f9ee0cc07ed5720b34a2c03867b320950d7 Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Thu, 10 Dec 2015 11:21:16 -0500
Subject: [PATCH 3/4] Compatibility and bug fixes.

---
 modules/cudafeatures2d/src/cuda/orb.cu | 5 +++++
 modules/cudafeatures2d/src/orb.cpp     | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/modules/cudafeatures2d/src/cuda/orb.cu b/modules/cudafeatures2d/src/cuda/orb.cu
index a84c601fb5..926c80a9b4 100644
--- a/modules/cudafeatures2d/src/cuda/orb.cu
+++ b/modules/cudafeatures2d/src/cuda/orb.cu
@@ -45,6 +45,7 @@
 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>
 #include <thrust/system/cuda/execution_policy.h>
+#include <thrust/version.h>
 
 
 #include "opencv2/core/cuda/common.hpp"
@@ -62,6 +63,7 @@ namespace cv { namespace cuda { namespace device
         {
             thrust::device_ptr<int> loc_ptr(loc);
             thrust::device_ptr<float> response_ptr(response);
+#if THRUST_VERSION >= 100800
             if(stream)
             {
                 thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
@@ -69,6 +71,9 @@ namespace cv { namespace cuda { namespace device
             {
                 thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
             }
+#else
+            thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+#endif
             return n_points;
         }
 
diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp
index ec1525374a..615ccc8db4 100644
--- a/modules/cudafeatures2d/src/orb.cpp
+++ b/modules/cudafeatures2d/src/orb.cpp
@@ -652,7 +652,7 @@ namespace
             Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
             buf_(inner).setTo(Scalar::all(255), stream);
 
-            cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], stream);
+            cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], cv::noArray(), stream);
         }
     }
 

From a21ede94522d94d4457bb2cc53d76e1789f9557b Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie@gmail.com>
Date: Thu, 10 Mar 2016 13:39:21 -0500
Subject: [PATCH 4/4] Thrust allocator usage.

---
 modules/cudafeatures2d/src/cuda/orb.cu | 13 +++++++++++-
 modules/cudaimgproc/src/cuda/gftt.cu   | 29 ++++++++++++++++----------
 modules/cudaimgproc/src/gftt.cpp       | 23 +++++++++-----------
 3 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/modules/cudafeatures2d/src/cuda/orb.cu b/modules/cudafeatures2d/src/cuda/orb.cu
index 926c80a9b4..182ca4fb86 100644
--- a/modules/cudafeatures2d/src/cuda/orb.cu
+++ b/modules/cudafeatures2d/src/cuda/orb.cu
@@ -51,7 +51,7 @@
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/reduce.hpp"
 #include "opencv2/core/cuda/functional.hpp"
-
+#include "opencv2/core/cuda/utility.hpp"
 namespace cv { namespace cuda { namespace device
 {
     namespace orb
@@ -64,6 +64,16 @@ namespace cv { namespace cuda { namespace device
             thrust::device_ptr<int> loc_ptr(loc);
             thrust::device_ptr<float> response_ptr(response);
 #if THRUST_VERSION >= 100800
+#if THRUST_VERSION >= 100802
+            if (stream)
+            {
+                thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+            }
+            else
+            {
+                thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+            }
+#else
             if(stream)
             {
                 thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
@@ -71,6 +81,7 @@ namespace cv { namespace cuda { namespace device
             {
                 thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
             }
+#endif
 #else
             thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
 #endif
diff --git a/modules/cudaimgproc/src/cuda/gftt.cu b/modules/cudaimgproc/src/cuda/gftt.cu
index 029df41ce8..ab8713f868 100644
--- a/modules/cudaimgproc/src/cuda/gftt.cu
+++ b/modules/cudaimgproc/src/cuda/gftt.cu
@@ -47,7 +47,7 @@
 
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/utility.hpp"
-
+#include <thrust/execution_policy.h>
 namespace cv { namespace cuda { namespace device
 {
     namespace gfft
@@ -91,12 +91,12 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
         {
             void* counter_ptr;
             cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
 
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
 
             bindTexture(&eigTex, eig);
 
@@ -104,17 +104,18 @@ namespace cv { namespace cuda { namespace device
             dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
 
             if (mask.data)
-                findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
+                findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
             else
-                findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
+                findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
 
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int count;
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
-
+            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            if (stream)
+                cudaSafeCall(cudaStreamSynchronize(stream));
+            else
+                cudaSafeCall( cudaDeviceSynchronize() );
             return std::min(count, max_count);
         }
 
@@ -128,13 +129,19 @@ namespace cv { namespace cuda { namespace device
         };
 
 
-        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
         {
             bindTexture(&eigTex, eig);
 
             thrust::device_ptr<float2> ptr(corners);
-
+#if THRUST_VERSION >= 100802
+            if (stream)
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater());
+            else
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater());
+#else
             thrust::sort(ptr, ptr + count, EigGreater());
+#endif
         }
     } // namespace optical_flow
 }}}
diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp
index 73221c44d1..bf5d01b117 100644
--- a/modules/cudaimgproc/src/gftt.cpp
+++ b/modules/cudaimgproc/src/gftt.cpp
@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace gfft
     {
-        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
-        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
     }
 }}}
 
@@ -97,9 +97,6 @@ namespace
 
     void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
     {
-        // TODO : implement async version
-        (void) stream;
-
         using namespace cv::cuda::device::gfft;
 
         GpuMat image = _image.getGpuMat();
@@ -108,14 +105,14 @@ namespace
         CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
 
         ensureSizeIsEnough(image.size(), CV_32FC1, eig_);
-        cornerCriteria_->compute(image, eig_);
+        cornerCriteria_->compute(image, eig_, stream);
 
         double maxVal = 0;
         cuda::minMax(eig_, 0, &maxVal);
-
+        cudaStream_t stream_ = StreamAccessor::getStream(stream);
         ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
 
-        int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
+        int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);
 
         if (total == 0)
         {
@@ -123,18 +120,18 @@ namespace
             return;
         }
 
-        sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
+        sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
 
         if (minDistance_ < 1)
         {
-            tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners);
+            tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners, stream);
         }
         else
         {
             std::vector<Point2f> tmp(total);
             Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
-            tmpCorners_.colRange(0, total).download(tmpMat);
-
+            tmpCorners_.colRange(0, total).download(tmpMat, stream);
+            stream.waitForCompletion();
             std::vector<Point2f> tmp2;
             tmp2.reserve(total);
 
@@ -203,7 +200,7 @@ namespace
             _corners.create(1, static_cast<int>(tmp2.size()), CV_32FC2);
             GpuMat corners = _corners.getGpuMat();
 
-            corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
+            corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]), stream);
         }
     }
 }