Merge pull request #6290 from dtmoodie:thrust_allocator_usage_pr

pull/6302/head
Alexander Alekhin 9 years ago
commit 4e479d58f8
  1. 30
      modules/cudafeatures2d/src/cuda/orb.cu
  2. 74
      modules/cudafeatures2d/src/orb.cpp
  3. 29
      modules/cudaimgproc/src/cuda/gftt.cu
  4. 23
      modules/cudaimgproc/src/gftt.cpp

@ -44,11 +44,14 @@
#include <thrust/device_ptr.h> #include <thrust/device_ptr.h>
#include <thrust/sort.h> #include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/version.h>
#include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
namespace orb namespace orb
@ -56,13 +59,32 @@ namespace cv { namespace cuda { namespace device
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
// cull // cull
int cull_gpu(int* loc, float* response, int size, int n_points) int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream)
{ {
thrust::device_ptr<int> loc_ptr(loc); thrust::device_ptr<int> loc_ptr(loc);
thrust::device_ptr<float> response_ptr(response); thrust::device_ptr<float> response_ptr(response);
#if THRUST_VERSION >= 100800
#if THRUST_VERSION >= 100802
if (stream)
{
thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
}
else
{
thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
}
#else
if(stream)
{
thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
}else
{
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
}
#endif
#else
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>()); thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
#endif
return n_points; return n_points;
} }

@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device
{ {
namespace orb namespace orb
{ {
int cull_gpu(int* loc, float* response, int size, int n_points); int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream);
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream); void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream);
@ -401,10 +401,10 @@ namespace
bool blurForDescriptor_; bool blurForDescriptor_;
private: private:
void buildScalePyramids(InputArray _image, InputArray _mask); void buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream);
void computeKeyPointsPyramid(); void computeKeyPointsPyramid(Stream& stream);
void computeDescriptors(OutputArray _descriptors); void computeDescriptors(OutputArray _descriptors, Stream& stream);
void mergeKeyPoints(OutputArray _keypoints); void mergeKeyPoints(OutputArray _keypoints, Stream& stream);
private: private:
Ptr<cv::cuda::FastFeatureDetector> fastDetector_; Ptr<cv::cuda::FastFeatureDetector> fastDetector_;
@ -582,13 +582,13 @@ namespace
{ {
CV_Assert( useProvidedKeypoints == false ); CV_Assert( useProvidedKeypoints == false );
buildScalePyramids(_image, _mask); buildScalePyramids(_image, _mask, stream);
computeKeyPointsPyramid(); computeKeyPointsPyramid(stream);
if (_descriptors.needed()) if (_descriptors.needed())
{ {
computeDescriptors(_descriptors); computeDescriptors(_descriptors, stream);
} }
mergeKeyPoints(_keypoints); mergeKeyPoints(_keypoints, stream);
} }
static float getScale(float scaleFactor, int firstLevel, int level) static float getScale(float scaleFactor, int firstLevel, int level)
@ -596,7 +596,7 @@ namespace
return pow(scaleFactor, level - firstLevel); return pow(scaleFactor, level - firstLevel);
} }
void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask) void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream)
{ {
const GpuMat image = _image.getGpuMat(); const GpuMat image = _image.getGpuMat();
const GpuMat mask = _mask.getGpuMat(); const GpuMat mask = _mask.getGpuMat();
@ -622,42 +622,42 @@ namespace
{ {
if (level < firstLevel_) if (level < firstLevel_)
{ {
cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR); cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream);
if (!mask.empty()) if (!mask.empty())
cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR); cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream);
} }
else else
{ {
cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR); cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream);
if (!mask.empty()) if (!mask.empty())
{ {
cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR); cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream);
cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO); cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO, stream);
} }
} }
} }
else else
{ {
image.copyTo(imagePyr_[level]); image.copyTo(imagePyr_[level], stream);
if (!mask.empty()) if (!mask.empty())
mask.copyTo(maskPyr_[level]); mask.copyTo(maskPyr_[level], stream);
} }
// Filter keypoints by image border // Filter keypoints by image border
ensureSizeIsEnough(sz, CV_8UC1, buf_); ensureSizeIsEnough(sz, CV_8UC1, buf_);
buf_.setTo(Scalar::all(0)); buf_.setTo(Scalar::all(0), stream);
Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_); Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
buf_(inner).setTo(Scalar::all(255)); buf_(inner).setTo(Scalar::all(255), stream);
cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]); cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], cv::noArray(), stream);
} }
} }
// takes keypoints and culls them by the response // takes keypoints and culls them by the response
static void cull(GpuMat& keypoints, int& count, int n_points) static void cull(GpuMat& keypoints, int& count, int n_points, Stream& stream)
{ {
using namespace cv::cuda::device::orb; using namespace cv::cuda::device::orb;
@ -670,11 +670,11 @@ namespace
return; return;
} }
count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points); count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points, StreamAccessor::getStream(stream));
} }
} }
void ORB_Impl::computeKeyPointsPyramid() void ORB_Impl::computeKeyPointsPyramid(Stream& stream)
{ {
using namespace cv::cuda::device::orb; using namespace cv::cuda::device::orb;
@ -690,7 +690,7 @@ namespace
fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area()); fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area());
GpuMat fastKpRange; GpuMat fastKpRange;
fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null()); fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], stream);
keyPointsCount_[level] = fastKpRange.cols; keyPointsCount_[level] = fastKpRange.cols;
@ -698,28 +698,28 @@ namespace
continue; continue;
ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]); ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]);
fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2)); fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2), stream);
const int n_features = static_cast<int>(n_features_per_level_[level]); const int n_features = static_cast<int>(n_features_per_level_[level]);
if (scoreType_ == ORB::HARRIS_SCORE) if (scoreType_ == ORB::HARRIS_SCORE)
{ {
// Keep more points than necessary as FAST does not give amazing corners // Keep more points than necessary as FAST does not give amazing corners
cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features); cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features, stream);
// Compute the Harris cornerness (better scoring than FAST) // Compute the Harris cornerness (better scoring than FAST)
HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0); HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, StreamAccessor::getStream(stream));
} }
//cull to the final desired level, using the new Harris scores or the original FAST scores. //cull to the final desired level, using the new Harris scores or the original FAST scores.
cull(keyPointsPyr_[level], keyPointsCount_[level], n_features); cull(keyPointsPyr_[level], keyPointsCount_[level], n_features, stream);
// Compute orientation // Compute orientation
IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0); IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, StreamAccessor::getStream(stream));
} }
} }
void ORB_Impl::computeDescriptors(OutputArray _descriptors) void ORB_Impl::computeDescriptors(OutputArray _descriptors, Stream& stream)
{ {
using namespace cv::cuda::device::orb; using namespace cv::cuda::device::orb;
@ -750,17 +750,17 @@ namespace
{ {
// preprocess the resized image // preprocess the resized image
ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_); ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
blurFilter_->apply(imagePyr_[level], buf_); blurFilter_->apply(imagePyr_[level], buf_, stream);
} }
computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0); keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, StreamAccessor::getStream(stream));
offset += keyPointsCount_[level]; offset += keyPointsCount_[level];
} }
} }
void ORB_Impl::mergeKeyPoints(OutputArray _keypoints) void ORB_Impl::mergeKeyPoints(OutputArray _keypoints, Stream& stream)
{ {
using namespace cv::cuda::device::orb; using namespace cv::cuda::device::orb;
@ -791,13 +791,13 @@ namespace
float locScale = level != firstLevel_ ? sf : 1.0f; float locScale = level != firstLevel_ ? sf : 1.0f;
mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0); mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, StreamAccessor::getStream(stream));
GpuMat range = keyPointsRange.rowRange(2, 4); GpuMat range = keyPointsRange.rowRange(2, 4);
keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range); keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range, stream);
keyPointsRange.row(4).setTo(Scalar::all(level)); keyPointsRange.row(4).setTo(Scalar::all(level), stream);
keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf)); keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf), stream);
offset += keyPointsCount_[level]; offset += keyPointsCount_[level];
} }

@ -47,7 +47,7 @@
#include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp" #include "opencv2/core/cuda/utility.hpp"
#include <thrust/execution_policy.h>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
namespace gfft namespace gfft
@ -91,12 +91,12 @@ namespace cv { namespace cuda { namespace device
} }
} }
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count) int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
{ {
void* counter_ptr; void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
bindTexture(&eigTex, eig); bindTexture(&eigTex, eig);
@ -104,17 +104,18 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y)); dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data) if (mask.data)
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols); findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
else else
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols); findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int count; int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
if (stream)
cudaSafeCall(cudaStreamSynchronize(stream));
else
cudaSafeCall( cudaDeviceSynchronize() );
return std::min(count, max_count); return std::min(count, max_count);
} }
@ -128,13 +129,19 @@ namespace cv { namespace cuda { namespace device
}; };
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count) void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
{ {
bindTexture(&eigTex, eig); bindTexture(&eigTex, eig);
thrust::device_ptr<float2> ptr(corners); thrust::device_ptr<float2> ptr(corners);
#if THRUST_VERSION >= 100802
if (stream)
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater());
else
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater());
#else
thrust::sort(ptr, ptr + count, EigGreater()); thrust::sort(ptr, ptr + count, EigGreater());
#endif
} }
} // namespace optical_flow } // namespace optical_flow
}}} }}}

@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
{ {
namespace gfft namespace gfft
{ {
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count); int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count); void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
} }
}}} }}}
@ -97,9 +97,6 @@ namespace
void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream) void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
{ {
// TODO : implement async version
(void) stream;
using namespace cv::cuda::device::gfft; using namespace cv::cuda::device::gfft;
GpuMat image = _image.getGpuMat(); GpuMat image = _image.getGpuMat();
@ -108,14 +105,14 @@ namespace
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) ); CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
ensureSizeIsEnough(image.size(), CV_32FC1, eig_); ensureSizeIsEnough(image.size(), CV_32FC1, eig_);
cornerCriteria_->compute(image, eig_); cornerCriteria_->compute(image, eig_, stream);
double maxVal = 0; double maxVal = 0;
cuda::minMax(eig_, 0, &maxVal); cuda::minMax(eig_, 0, &maxVal);
cudaStream_t stream_ = StreamAccessor::getStream(stream);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols); int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);
if (total == 0) if (total == 0)
{ {
@ -123,18 +120,18 @@ namespace
return; return;
} }
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total); sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
if (minDistance_ < 1) if (minDistance_ < 1)
{ {
tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners); tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners, stream);
} }
else else
{ {
std::vector<Point2f> tmp(total); std::vector<Point2f> tmp(total);
Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]); Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
tmpCorners_.colRange(0, total).download(tmpMat); tmpCorners_.colRange(0, total).download(tmpMat, stream);
stream.waitForCompletion();
std::vector<Point2f> tmp2; std::vector<Point2f> tmp2;
tmp2.reserve(total); tmp2.reserve(total);
@ -203,7 +200,7 @@ namespace
_corners.create(1, static_cast<int>(tmp2.size()), CV_32FC2); _corners.create(1, static_cast<int>(tmp2.size()), CV_32FC2);
GpuMat corners = _corners.getGpuMat(); GpuMat corners = _corners.getGpuMat();
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0])); corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]), stream);
} }
} }
} }

Loading…
Cancel
Save