From 79ecefb51fc374fbd813cc1d44c9d3d3f579d69d Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 9 Dec 2015 11:05:07 -0500 Subject: [PATCH 1/4] Implemented async calls. --- modules/cudafeatures2d/src/cuda/orb.cu | 14 ++++-- modules/cudafeatures2d/src/orb.cpp | 66 +++++++++++++------------- 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/modules/cudafeatures2d/src/cuda/orb.cu b/modules/cudafeatures2d/src/cuda/orb.cu index 2e4f2e0068..a84c601fb5 100644 --- a/modules/cudafeatures2d/src/cuda/orb.cu +++ b/modules/cudafeatures2d/src/cuda/orb.cu @@ -44,6 +44,8 @@ #include #include +#include + #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/reduce.hpp" @@ -56,13 +58,17 @@ namespace cv { namespace cuda { namespace device //////////////////////////////////////////////////////////////////////////////////////////////////////// // cull - int cull_gpu(int* loc, float* response, int size, int n_points) + int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream) { thrust::device_ptr loc_ptr(loc); thrust::device_ptr response_ptr(response); - - thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); - + if(stream) + { + thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); + }else + { + thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); + } return n_points; } diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp index 6bfdd5ac47..deb5ccf53a 100644 --- a/modules/cudafeatures2d/src/orb.cpp +++ b/modules/cudafeatures2d/src/orb.cpp @@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device { namespace orb { - int cull_gpu(int* loc, float* response, int size, int n_points); + int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream); void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream); @@ -401,10 +401,10 @@ namespace bool blurForDescriptor_; private: - void buildScalePyramids(InputArray _image, InputArray _mask); - void computeKeyPointsPyramid(); - void computeDescriptors(OutputArray _descriptors); - void mergeKeyPoints(OutputArray _keypoints); + void buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream); + void computeKeyPointsPyramid(Stream& stream); + void computeDescriptors(OutputArray _descriptors, Stream& stream); + void mergeKeyPoints(OutputArray _keypoints, Stream& stream); private: Ptr fastDetector_; @@ -582,13 +582,13 @@ namespace { CV_Assert( useProvidedKeypoints == false ); - buildScalePyramids(_image, _mask); - computeKeyPointsPyramid(); + buildScalePyramids(_image, _mask, stream); + computeKeyPointsPyramid(stream); if (_descriptors.needed()) { - computeDescriptors(_descriptors); + computeDescriptors(_descriptors, stream); } - mergeKeyPoints(_keypoints); + mergeKeyPoints(_keypoints, stream); } static float getScale(float scaleFactor, int firstLevel, int level) @@ -596,7 +596,7 @@ namespace return pow(scaleFactor, level - firstLevel); } - void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask) + void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream) { const GpuMat image = _image.getGpuMat(); const GpuMat mask = _mask.getGpuMat(); @@ -622,28 +622,28 @@ namespace { if (level < firstLevel_) { - cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR); + cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream); if (!mask.empty()) - cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR); + cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream); } else { - cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR); + cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream); if (!mask.empty()) { - cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR); - cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO); + cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream); + cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO, stream); } } } else { - image.copyTo(imagePyr_[level]); + image.copyTo(imagePyr_[level], stream); if (!mask.empty()) - mask.copyTo(maskPyr_[level]); + mask.copyTo(maskPyr_[level], stream); } // Filter keypoints by image border @@ -652,12 +652,12 @@ namespace Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_); buf_(inner).setTo(Scalar::all(255)); - cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]); + cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], stream); } } // takes keypoints and culls them by the response - static void cull(GpuMat& keypoints, int& count, int n_points) + static void cull(GpuMat& keypoints, int& count, int n_points, Stream& stream) { using namespace cv::cuda::device::orb; @@ -670,11 +670,11 @@ namespace return; } - count = cull_gpu(keypoints.ptr(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points); + count = cull_gpu(keypoints.ptr(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points, StreamAccessor::getStream(stream)); } } - void ORB_Impl::computeKeyPointsPyramid() + void ORB_Impl::computeKeyPointsPyramid(Stream& stream) { using namespace cv::cuda::device::orb; @@ -690,7 +690,7 @@ namespace fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area()); GpuMat fastKpRange; - fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null()); + fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], stream); keyPointsCount_[level] = fastKpRange.cols; @@ -698,28 +698,28 @@ namespace continue; ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]); - fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2)); + fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2), stream); const int n_features = static_cast(n_features_per_level_[level]); if (scoreType_ == ORB::HARRIS_SCORE) { // Keep more points than necessary as FAST does not give amazing corners - cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features); + cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features, stream); // Compute the Harris cornerness (better scoring than FAST) - HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(1), keyPointsCount_[level], 7, HARRIS_K, 0); + HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(1), keyPointsCount_[level], 7, HARRIS_K, StreamAccessor::getStream(stream)); } //cull to the final desired level, using the new Harris scores or the original FAST scores. - cull(keyPointsPyr_[level], keyPointsCount_[level], n_features); + cull(keyPointsPyr_[level], keyPointsCount_[level], n_features, stream); // Compute orientation - IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(2), keyPointsCount_[level], half_patch_size, 0); + IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(2), keyPointsCount_[level], half_patch_size, StreamAccessor::getStream(stream)); } } - void ORB_Impl::computeDescriptors(OutputArray _descriptors) + void ORB_Impl::computeDescriptors(OutputArray _descriptors, Stream& stream) { using namespace cv::cuda::device::orb; @@ -750,17 +750,17 @@ namespace { // preprocess the resized image ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_); - blurFilter_->apply(imagePyr_[level], buf_); + blurFilter_->apply(imagePyr_[level], buf_, stream); } computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(2), - keyPointsCount_[level], pattern_.ptr(0), pattern_.ptr(1), descRange, descriptorSize(), WTA_K_, 0); + keyPointsCount_[level], pattern_.ptr(0), pattern_.ptr(1), descRange, descriptorSize(), WTA_K_, StreamAccessor::getStream(stream)); offset += keyPointsCount_[level]; } } - void ORB_Impl::mergeKeyPoints(OutputArray _keypoints) + void ORB_Impl::mergeKeyPoints(OutputArray _keypoints, Stream& stream) { using namespace cv::cuda::device::orb; @@ -791,10 +791,10 @@ namespace float locScale = level != firstLevel_ ? sf : 1.0f; - mergeLocation_gpu(keyPointsPyr_[level].ptr(0), keyPointsRange.ptr(0), keyPointsRange.ptr(1), keyPointsCount_[level], locScale, 0); + mergeLocation_gpu(keyPointsPyr_[level].ptr(0), keyPointsRange.ptr(0), keyPointsRange.ptr(1), keyPointsCount_[level], locScale, StreamAccessor::getStream(stream)); GpuMat range = keyPointsRange.rowRange(2, 4); - keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range); + keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range, stream); keyPointsRange.row(4).setTo(Scalar::all(level)); keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf)); From a8ca5606845d8b0c5b764004f322ae4e73434d86 Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 9 Dec 2015 11:08:17 -0500 Subject: [PATCH 2/4] Fixed more missing async calls. --- modules/cudafeatures2d/src/orb.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp index deb5ccf53a..ec1525374a 100644 --- a/modules/cudafeatures2d/src/orb.cpp +++ b/modules/cudafeatures2d/src/orb.cpp @@ -648,9 +648,9 @@ namespace // Filter keypoints by image border ensureSizeIsEnough(sz, CV_8UC1, buf_); - buf_.setTo(Scalar::all(0)); + buf_.setTo(Scalar::all(0), stream); Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_); - buf_(inner).setTo(Scalar::all(255)); + buf_(inner).setTo(Scalar::all(255), stream); cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], stream); } @@ -796,8 +796,8 @@ namespace GpuMat range = keyPointsRange.rowRange(2, 4); keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range, stream); - keyPointsRange.row(4).setTo(Scalar::all(level)); - keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf)); + keyPointsRange.row(4).setTo(Scalar::all(level), stream); + keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf), stream); offset += keyPointsCount_[level]; } From 7a934f9ee0cc07ed5720b34a2c03867b320950d7 Mon Sep 17 00:00:00 2001 From: Dan Date: Thu, 10 Dec 2015 11:21:16 -0500 Subject: [PATCH 3/4] Compatibility and bug fixes. --- modules/cudafeatures2d/src/cuda/orb.cu | 5 +++++ modules/cudafeatures2d/src/orb.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/cudafeatures2d/src/cuda/orb.cu b/modules/cudafeatures2d/src/cuda/orb.cu index a84c601fb5..926c80a9b4 100644 --- a/modules/cudafeatures2d/src/cuda/orb.cu +++ b/modules/cudafeatures2d/src/cuda/orb.cu @@ -45,6 +45,7 @@ #include #include #include +#include #include "opencv2/core/cuda/common.hpp" @@ -62,6 +63,7 @@ namespace cv { namespace cuda { namespace device { thrust::device_ptr loc_ptr(loc); thrust::device_ptr response_ptr(response); +#if THRUST_VERSION >= 100800 if(stream) { thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); @@ -69,6 +71,9 @@ namespace cv { namespace cuda { namespace device { thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); } +#else + thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); +#endif return n_points; } diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp index ec1525374a..615ccc8db4 100644 --- a/modules/cudafeatures2d/src/orb.cpp +++ b/modules/cudafeatures2d/src/orb.cpp @@ -652,7 +652,7 @@ namespace Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_); buf_(inner).setTo(Scalar::all(255), stream); - cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], stream); + cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], cv::noArray(), stream); } } From a21ede94522d94d4457bb2cc53d76e1789f9557b Mon Sep 17 00:00:00 2001 From: Dan Date: Thu, 10 Mar 2016 13:39:21 -0500 Subject: [PATCH 4/4] Thrust allocator usage. --- modules/cudafeatures2d/src/cuda/orb.cu | 13 +++++++++++- modules/cudaimgproc/src/cuda/gftt.cu | 29 ++++++++++++++++---------- modules/cudaimgproc/src/gftt.cpp | 23 +++++++++----------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/modules/cudafeatures2d/src/cuda/orb.cu b/modules/cudafeatures2d/src/cuda/orb.cu index 926c80a9b4..182ca4fb86 100644 --- a/modules/cudafeatures2d/src/cuda/orb.cu +++ b/modules/cudafeatures2d/src/cuda/orb.cu @@ -51,7 +51,7 @@ #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/functional.hpp" - +#include "opencv2/core/cuda/utility.hpp" namespace cv { namespace cuda { namespace device { namespace orb @@ -64,6 +64,16 @@ namespace cv { namespace cuda { namespace device thrust::device_ptr loc_ptr(loc); thrust::device_ptr response_ptr(response); #if THRUST_VERSION >= 100800 +#if THRUST_VERSION >= 100802 + if (stream) + { + thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); + } + else + { + thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); + } +#else if(stream) { thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); @@ -71,6 +81,7 @@ namespace cv { namespace cuda { namespace device { thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); } +#endif #else thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); #endif diff --git a/modules/cudaimgproc/src/cuda/gftt.cu b/modules/cudaimgproc/src/cuda/gftt.cu index 029df41ce8..ab8713f868 100644 --- a/modules/cudaimgproc/src/cuda/gftt.cu +++ b/modules/cudaimgproc/src/cuda/gftt.cu @@ -47,7 +47,7 @@ #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/utility.hpp" - +#include namespace cv { namespace cuda { namespace device { namespace gfft @@ -91,12 +91,12 @@ namespace cv { namespace cuda { namespace device } } - int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count) + int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); bindTexture(&eigTex, eig); @@ -104,17 +104,18 @@ namespace cv { namespace cuda { namespace device dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y)); if (mask.data) - findCorners<<>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols); + findCorners<<>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols); else - findCorners<<>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols); + findCorners<<>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); - + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + if (stream) + cudaSafeCall(cudaStreamSynchronize(stream)); + else + cudaSafeCall( cudaDeviceSynchronize() ); return std::min(count, max_count); } @@ -128,13 +129,19 @@ namespace cv { namespace cuda { namespace device }; - void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count) + void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream) { bindTexture(&eigTex, eig); thrust::device_ptr ptr(corners); - +#if THRUST_VERSION >= 100802 + if (stream) + thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater()); + else + thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater()); +#else thrust::sort(ptr, ptr + count, EigGreater()); +#endif } } // namespace optical_flow }}} diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp index 73221c44d1..bf5d01b117 100644 --- a/modules/cudaimgproc/src/gftt.cpp +++ b/modules/cudaimgproc/src/gftt.cpp @@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device { namespace gfft { - int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count); - void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count); + int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream); + void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream); } }}} @@ -97,9 +97,6 @@ namespace void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream) { - // TODO : implement async version - (void) stream; - using namespace cv::cuda::device::gfft; GpuMat image = _image.getGpuMat(); @@ -108,14 +105,14 @@ namespace CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) ); ensureSizeIsEnough(image.size(), CV_32FC1, eig_); - cornerCriteria_->compute(image, eig_); + cornerCriteria_->compute(image, eig_, stream); double maxVal = 0; cuda::minMax(eig_, 0, &maxVal); - + cudaStream_t stream_ = StreamAccessor::getStream(stream); ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); - int total = findCorners_gpu(eig_, static_cast(maxVal * qualityLevel_), mask, tmpCorners_.ptr(), tmpCorners_.cols); + int total = findCorners_gpu(eig_, static_cast(maxVal * qualityLevel_), mask, tmpCorners_.ptr(), tmpCorners_.cols, stream_); if (total == 0) { @@ -123,18 +120,18 @@ namespace return; } - sortCorners_gpu(eig_, tmpCorners_.ptr(), total); + sortCorners_gpu(eig_, tmpCorners_.ptr(), total, stream_); if (minDistance_ < 1) { - tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners); + tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners, stream); } else { std::vector tmp(total); Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]); - tmpCorners_.colRange(0, total).download(tmpMat); - + tmpCorners_.colRange(0, total).download(tmpMat, stream); + stream.waitForCompletion(); std::vector tmp2; tmp2.reserve(total); @@ -203,7 +200,7 @@ namespace _corners.create(1, static_cast(tmp2.size()), CV_32FC2); GpuMat corners = _corners.getGpuMat(); - corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0])); + corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0]), stream); } } }