From 8bb9e4302e097dd3a0ff432da4d8e4037dae3040 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Wed, 10 Aug 2011 11:32:48 +0000
Subject: [PATCH] added registerPageLocked/unregisterPageLocked functions

added convert functions to BruteForceMatcher_GPU
other minor fixes

---
 modules/gpu/include/opencv2/gpu/gpu.hpp        | 29 ++++--
 modules/gpu/src/brute_force_matcher.cpp        | 95 +++++++++++++------
 modules/gpu/src/cuda/brute_force_matcher.cu    | 62 ++++--------
 modules/gpu/src/cuda/calib3d.cu                |  9 +-
 modules/gpu/src/cuda/canny.cu                  |  7 +-
 modules/gpu/src/cuda/element_operations.cu     | 18 ++--
 modules/gpu/src/cuda/hist.cu                   | 21 ++--
 modules/gpu/src/cuda/imgproc.cu                | 10 +-
 modules/gpu/src/cuda/internal_shared.hpp       | 44 ++++++++-
 modules/gpu/src/cuda/matrix_operations.cu      | 16 ++--
 modules/gpu/src/cuda/surf.cu                   |  4 +-
 modules/gpu/src/matrix_operations.cpp          | 19 ++--
 .../opencv2/gpu/device/detail/transform.hpp    | 44 ++++-----
 .../gpu/src/opencv2/gpu/device/functional.hpp  | 72 +++++++-------
 .../gpu/src/opencv2/gpu/device/transform.hpp   |  8 +-
 .../gpu/src/opencv2/gpu/device/utility.hpp     | 10 +-
 modules/gpu/src/surf.cpp                       | 34 +++----
 samples/gpu/performance/tests.cpp              |  2 +-
 18 files changed, 276 insertions(+), 228 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 3d73633cba..9ef46a477d 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -149,6 +149,11 @@ namespace cv
         // It is convertible to cv::Mat header without reference counting
         // so you can use it with other opencv functions.
 
+        // Page-locks the matrix m memory and maps it for the device(s)
+        CV_EXPORTS void registerPageLocked(Mat& m);
+        // Unmaps the memory of matrix m, and makes it pageable again.
+        CV_EXPORTS void unregisterPageLocked(Mat& m);
+
         class CV_EXPORTS CudaMem
         {
         public:
@@ -1254,8 +1259,10 @@ namespace cv
             GpuMat& trainIdx, GpuMat& distance,
             const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-        // Download trainIdx and distance to CPU vector with DMatch
+        // Download trainIdx and distance and convert them to CPU vector with DMatch
         static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
+        // Convert trainIdx and distance to vector with DMatch
+        static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);
 
         // Find one best match for each query descriptor.
         void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector<DMatch>& matches,
@@ -1273,13 +1280,13 @@ namespace cv
             GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
             const GpuMat& maskCollection, Stream& stream = Stream::Null());
 
-        // Download trainIdx, imgIdx and distance to CPU vector with DMatch
-        static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
-            std::vector<DMatch>& matches);
+        // Download trainIdx, imgIdx and distance and convert them to vector with DMatch
+        static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
+        // Convert trainIdx, imgIdx and distance to vector with DMatch
+        static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);
 
         // Find one best match from train collection for each query descriptor.
-        void match(const GpuMat& queryDescs, std::vector<DMatch>& matches,
-            const std::vector<GpuMat>& masks = std::vector<GpuMat>());
+        void match(const GpuMat& queryDescs, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
 
         // Find k best matches for each query descriptor (in increasing order of distances).
         // trainIdx.at<int>(queryIdx, i) will contain index of i'th best trains (i < k).
@@ -1291,12 +1298,15 @@ namespace cv
         void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
             GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
             const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-        // Download trainIdx and distance to CPU vector with DMatch
+        // Download trainIdx and distance and convert them to vector with DMatch
         // compactResult is used when mask is not empty. If compactResult is false matches
         // vector will have the same size as queryDescriptors rows. If compactResult is true
         // matches vector will not contain matches for fully masked out query descriptors.
         static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
             std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+        // Convert trainIdx and distance to vector with DMatch
+        static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
+            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
         // Find k best matches for each query descriptor (in increasing order of distances).
         // compactResult is used when mask is not empty. If compactResult is false matches
         // vector will have the same size as queryDescriptors rows.
@@ -1326,13 +1336,16 @@ namespace cv
             GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance,
             const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-        // Download trainIdx, nMatches and distance to CPU vector with DMatch.
+        // Download trainIdx, nMatches and distance and convert them to vector with DMatch.
         // matches will be sorted in increasing order of distances.
         // compactResult is used when mask is not empty. If compactResult is false matches
         // vector will have the same size as queryDescriptors rows. If compactResult is true
         // matches vector will not contain matches for fully masked out query descriptors.
         static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, const GpuMat& distance,
             std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+        // Convert trainIdx, nMatches and distance to vector with DMatch.
+        static void radiusMatchConvert(const Mat& trainIdx, const Mat& nMatches, const Mat& distance,
+            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
         // Find best matches for each query descriptor which have distance less than maxDistance
         // (in increasing order of distances).
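The new *Convert entry points separate DMatch construction from the device-to-host copy. A minimal usage sketch, not part of the patch itself (it assumes the OpenCV 2.3 gpu API declared above, including the default BruteForceMatcher_GPU_base constructor):

    #include <opencv2/gpu/gpu.hpp>
    #include <vector>

    // Match on the GPU, then turn the raw result matrices into DMatch
    // objects on the CPU. matchDownload() bundles these two steps.
    void matchSketch(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs)
    {
        cv::gpu::BruteForceMatcher_GPU_base matcher;

        cv::gpu::GpuMat trainIdx, distance;
        matcher.matchSingle(queryDescs, trainDescs, trainIdx, distance);

        cv::Mat trainIdxCPU(trainIdx);     // explicit download...
        cv::Mat distanceCPU(distance);

        std::vector<cv::DMatch> matches;   // ...then CPU-side conversion
        cv::gpu::BruteForceMatcher_GPU_base::matchConvert(trainIdxCPU, distanceCPU, matches);
    }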
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index f4ec3cc52f..19521a2a64 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -56,17 +56,21 @@ bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return
 bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }
 void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
 
@@ -216,8 +220,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
     func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
-    vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
+{
+    if (trainIdx.empty() || distance.empty())
+        return;
+
+    Mat trainIdxCPU = trainIdx;
+    Mat distanceCPU = distance;
+
+    matchConvert(trainIdxCPU, distanceCPU, matches);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches)
 {
     if (trainIdx.empty() || distance.empty())
         return;
@@ -227,14 +241,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
 
     const int nQuery = trainIdx.cols;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
-    const int* trainIdx_ptr = trainIdxCPU.ptr<int>();
-    const float* distance_ptr = distanceCPU.ptr<float>();
+    const int* trainIdx_ptr = trainIdx.ptr<int>();
+    const float* distance_ptr = distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
@@ -347,8 +358,19 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
     func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,
-    const GpuMat& distance, vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
+{
+    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
+        return;
+
+    Mat trainIdxCPU = trainIdx;
+    Mat imgIdxCPU = imgIdx;
+    Mat distanceCPU = distance;
+
+    matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty())
         return;
@@ -359,16 +381,12 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
 
     const int nQuery = trainIdx.cols;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat imgIdxCPU = imgIdx;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
-    const int* trainIdx_ptr = trainIdxCPU.ptr<int>();
-    const int* imgIdx_ptr = imgIdxCPU.ptr<int>();
-    const float* distance_ptr = distanceCPU.ptr<float>();
+    const int* trainIdx_ptr = trainIdx.ptr<int>();
+    const int* imgIdx_ptr = imgIdx.ptr<int>();
+    const float* distance_ptr = distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
@@ -385,8 +403,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
     }
 }
 
-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches,
-    const vector<GpuMat>& masks)
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches, const vector<GpuMat>& masks)
 {
     GpuMat trainCollection;
     GpuMat maskCollection;
@@ -462,15 +479,24 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
     if (trainIdx.empty() || distance.empty())
         return;
 
+    Mat trainIdxCPU = trainIdx;
+    Mat distanceCPU = distance;
+
+    knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
+    std::vector< std::vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || distance.empty())
+        return;
+
     CV_Assert(trainIdx.type() == CV_32SC1);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
 
     const int nQuery = distance.rows;
     const int k = trainIdx.cols;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
@@ -480,8 +506,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
         vector<DMatch>& curMatches = matches.back();
         curMatches.reserve(k);
 
-        int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);
-        float* distance_ptr = distanceCPU.ptr<float>(queryIdx);
+        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
+        const float* distance_ptr = distance.ptr<float>(queryIdx);
         for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
             int trainIdx = *trainIdx_ptr;
@@ -614,24 +640,33 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
     if (trainIdx.empty() || nMatches.empty() || distance.empty())
         return;
 
+    Mat trainIdxCPU = trainIdx;
+    Mat nMatchesCPU = nMatches;
+    Mat distanceCPU = distance;
+
+    radiusMatchConvert(trainIdxCPU, nMatchesCPU, distanceCPU, matches, compactResult);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& nMatches, const Mat& distance,
+    std::vector< std::vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || nMatches.empty() || distance.empty())
+        return;
+
     CV_Assert(trainIdx.type() == CV_32SC1);
     CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
 
     const int nQuery = trainIdx.rows;
 
-    Mat trainIdxCPU = trainIdx;
-    Mat nMatchesCPU = nMatches;
-    Mat distanceCPU = distance;
-
     matches.clear();
     matches.reserve(nQuery);
 
-    const unsigned int* nMatches_ptr = nMatchesCPU.ptr<unsigned int>();
+    const unsigned int* nMatches_ptr = nMatches.ptr<unsigned int>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
     {
-        const int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);
-        const float* distance_ptr = distanceCPU.ptr<float>(queryIdx);
+        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
+        const float* distance_ptr = distance.ptr<float>(queryIdx);
 
         const int nMatches = std::min(static_cast<int>(nMatches_ptr[queryIdx]), trainIdx.cols);
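Splitting download from conversion is what makes an asynchronous pipeline possible. A hedged sketch combining the convert functions with the new page-locking helpers (it assumes Stream::enqueueDownload accepts a page-locked cv::Mat, which this patch does not itself add):

    // Overlap the device-to-host copy with other work, then convert on the CPU.
    cv::gpu::Stream stream;
    cv::gpu::GpuMat trainIdx, distance;
    matcher.matchSingle(queryDescs, trainDescs, trainIdx, distance, cv::gpu::GpuMat(), stream);

    cv::Mat trainIdxCPU(trainIdx.size(), trainIdx.type());
    cv::Mat distanceCPU(distance.size(), distance.type());
    cv::gpu::registerPageLocked(trainIdxCPU);    // pin so the copies can be asynchronous
    cv::gpu::registerPageLocked(distanceCPU);

    stream.enqueueDownload(trainIdx, trainIdxCPU);
    stream.enqueueDownload(distance, distanceCPU);
    stream.waitForCompletion();

    std::vector<cv::DMatch> matches;
    matcher.matchConvert(trainIdxCPU, distanceCPU, matches);

    cv::gpu::unregisterPageLocked(distanceCPU);
    cv::gpu::unregisterPageLocked(trainIdxCPU);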
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index 17c5c802f7..6b1361901c 100644
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -56,9 +56,8 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // Mask strategy
 
-    class SingleMask
+    struct SingleMask
     {
-    public:
         explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}
 
         __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -66,13 +65,11 @@ namespace cv { namespace gpu { namespace bfmatcher
             return mask.ptr(queryIdx)[trainIdx] != 0;
         }
 
-    private:
-        PtrStep mask;
+        const PtrStep mask;
     };
 
-    class MaskCollection
+    struct MaskCollection
     {
-    public:
         explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}
 
         __device__ __forceinline__ void nextMask()
@@ -86,15 +83,14 @@ namespace cv { namespace gpu { namespace bfmatcher
             return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0));
         }
 
-    private:
-        PtrStep* maskCollection;
+        const PtrStep* maskCollection;
         PtrStep curMask;
     };
 
     class WithOutMask
     {
     public:
-        __device__ __forceinline__ void nextMask()
+        __device__ __forceinline__ void nextMask() const
        {
        }
        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -128,9 +124,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Distance

-    template <typename T> class L1Dist
+    template <typename T> struct L1Dist
    {
-    public:
        typedef int ResultType;
        typedef int ValueType;
@@ -151,12 +146,10 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        int mySum;
    };
-    template <> class L1Dist<float>
+    template <> struct L1Dist<float>
    {
-    public:
        typedef float ResultType;
        typedef float ValueType;
@@ -177,13 +170,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        float mySum;
    };

-    class L2Dist
+    struct L2Dist
    {
-    public:
        typedef float ResultType;
        typedef float ValueType;
@@ -205,13 +196,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return sqrtf(mySum);
        }

-    private:
        float mySum;
    };

-    class HammingDist
+    struct HammingDist
    {
-    public:
        typedef int ResultType;
        typedef int ValueType;
@@ -232,7 +221,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        int mySum;
    };

@@ -425,10 +413,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // ReduceDescCalculator

-    template <typename T>
-    class ReduceDescCalculatorSimple
+    template <typename T> struct ReduceDescCalculatorSimple
    {
-    public:
        __device__ __forceinline__ void prepare(const T* queryDescs_, int, void*)
        {
            queryDescs = queryDescs_;
@@ -440,14 +426,12 @@ namespace cv { namespace gpu { namespace bfmatcher
            reduceDescDiff(queryDescs, trainDescs, desc_len, dist, sdiff_row);
        }

-    private:
        const T* queryDescs;
    };

    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T, typename U>
-    class ReduceDescCalculatorCached
+    struct ReduceDescCalculatorCached
    {
-    public:
        __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem)
        {
            loadDescsVals(queryDescs, desc_len, queryVals, smem);
@@ -459,7 +443,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            reduceDescDiffCached(queryVals, trainDescs, desc_len, dist, sdiff_row);
        }

-    private:
        U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
    };

@@ -497,10 +480,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Train collection loop strategy

-    template <typename T>
-    class SingleTrain
+    template <typename T> struct SingleTrain
    {
-    public:
        explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)
        {
        }
@@ -517,14 +498,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return trainDescs.cols;
        }

-    private:
-        DevMem2D_<T> trainDescs;
+        const DevMem2D_<T> trainDescs;
    };

-    template <typename T>
-    class TrainCollection
+    template <typename T> struct TrainCollection
    {
-    public:
        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
        {
        }
@@ -536,7 +514,7 @@ namespace cv { namespace gpu { namespace bfmatcher
        {
            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
            {
-                DevMem2D_<T> trainDescs = trainCollection[imgIdx];
+                const DevMem2D_<T> trainDescs = trainCollection[imgIdx];
                m.nextMask();
                matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
            }
@@ -547,7 +525,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            return desclen;
        }

-    private:
        const DevMem2D_<T>* trainCollection;
        int nImg;
        int desclen;
@@ -806,7 +783,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    // Calc distance kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)
+    __global__ void calcDistance(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_, const Mask mask, PtrStepf distance)
    {
        __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];

@@ -989,8 +966,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel

-    template <int BLOCK_SIZE>
-    __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
+    template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
    {
        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
        __shared__ float sdist[SMEM_SIZE];
@@ -1130,8 +1106,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    // Radius Match kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void radiusMatch(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_,
-        float maxDistance, Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
+    __global__ void radiusMatch(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_,
+        float maxDistance, const Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
    {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
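SingleMask, MaskCollection and WithOutMask are compile-time policies: each matcher kernel is instantiated once per mask type, so the unmasked variant pays nothing at run time. A plain C++ analogue of the pattern, illustrative only and not part of the module:

    struct AllowAll
    {
        bool operator()(int, int) const { return true; }  // same call shape as WithOutMask
    };

    template <typename Mask>
    int countAllowed(int rows, int cols, const Mask& mask)
    {
        int n = 0;
        for (int queryIdx = 0; queryIdx < rows; ++queryIdx)
            for (int trainIdx = 0; trainIdx < cols; ++trainIdx)
                if (mask(queryIdx, trainIdx))   // inlined per instantiation
                    ++n;
        return n;
    }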
diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu
index 13136d9a48..2a30393817 100644
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -42,6 +42,7 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 
 #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
 
@@ -56,9 +57,9 @@ namespace cv { namespace gpu
        __constant__ float3 crot2;
        __constant__ float3 ctransl;

-        struct TransformOp
+        struct TransformOp : unary_function<float3, float3>
        {
-            __device__ __forceinline__ float3 operator()(float3 p) const
+            __device__ __forceinline__ float3 operator()(const float3& p) const
            {
                return make_float3(
                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
@@ -89,9 +90,9 @@ namespace cv { namespace gpu
        __constant__ float3 cproj0;
        __constant__ float3 cproj1;

-        struct ProjectOp
+        struct ProjectOp : unary_function<float3, float2>
        {
-            __device__ __forceinline__ float2 operator()(float3 p) const
+            __device__ __forceinline__ float2 operator()(const float3& p) const
            {
                // Rotate and translate in 3D
                float3 t = make_float3(
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index b2acae9711..19f9b384c4 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -49,7 +49,7 @@ using namespace cv::gpu::device;
 
 namespace cv { namespace gpu { namespace canny
 {
-    __global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+    __global__ void calcSobelRowPass(const PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
    {
        __shared__ int smem[16][18];

@@ -100,7 +100,8 @@ namespace cv { namespace gpu { namespace canny
        }
    };

-    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
+        PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
    {
        __shared__ int sdx[18][16];
        __shared__ int sdy[18][16];
@@ -179,7 +180,7 @@ namespace cv { namespace gpu { namespace canny
 #define CANNY_SHIFT 15
 #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
 
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ ... @@ namespace cv { namespace gpu { namespace mathfunc
-    template <typename T1, typename T2>
-    struct NotEqual
+    template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
    {
-        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2)
+        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
        {
            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
        }
    };
@@ -467,8 +466,7 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // pow

-    template<typename T, bool Signed = device::numeric_limits_gpu<T>::is_signed>
-    struct PowOp
+    template<typename T, bool Signed = device::numeric_limits_gpu<T>::is_signed> struct PowOp : unary_function<T, T>
    {
        float power;
        PowOp(float power_) : power(power_) {}
@@ -479,13 +477,12 @@ namespace cv { namespace gpu { namespace mathfunc
        }
    };

-    template<typename T>
-    struct PowOp<T, true>
+    template<typename T> struct PowOp<T, true> : unary_function<T, float>
    {
        float power;
        PowOp(float power_) : power(power_) {}

-        __device__ __forceinline__ float operator()(const T& e)
+        __device__ __forceinline__ float operator()(const T& e) const
        {
            T res = saturate_cast<T>(__powf((float)e, power));

@@ -495,13 +492,12 @@ namespace cv { namespace gpu { namespace mathfunc
        }
    };

-    template<>
-    struct PowOp<float>
+    template<> struct PowOp<float> : unary_function<float, float>
    {
        float power;
        PowOp(float power_) : power(power_) {}

-        __device__ __forceinline__ float operator()(const float& e)
+        __device__ __forceinline__ float operator()(const float& e) const
        {
            return __powf(fabs(e), power);
        }
diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
index 23f8733fb1..cafcb0909a 100644
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -105,7 +105,7 @@ namespace cv { namespace gpu { namespace histograms
        if (x + 3 < cols)
            addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
    }

-    __global__ void histogram256(PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
+    __global__ void histogram256(const PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
    {
        //Per-warp subhistogram storage
        __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
@@ -189,21 +189,18 @@ namespace cv { namespace gpu { namespace histograms
        cudaSafeCall( cudaDeviceSynchronize() );
    }

-    __global__ void equalizeHist(DevMem2D src, PtrStep dst, const int* lut)
-    {
-        __shared__ int s_lut[256];
-
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        s_lut[tid] = lut[tid];
-        __syncthreads();
+    __constant__ int c_lut[256];

+    __global__ void equalizeHist(const DevMem2D src, PtrStep dst)
+    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < src.cols && y < src.rows)
        {
-            dst.ptr(y)[x] = __float2int_rn(255.0f * s_lut[src.ptr(y)[x]] / (src.cols * src.rows));
+            const uchar val = src.ptr(y)[x];
+            const int lut = c_lut[val];
+            dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
        }
    }

@@ -212,7 +209,9 @@ namespace cv { namespace gpu { namespace histograms
        dim3 block(16, 16);
        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        equalizeHist<<<grid, block, 0, stream>>>(src, dst, lut);
+        cudaSafeCall( cudaMemcpyToSymbol(cv::gpu::histograms::c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
+
+        equalizeHist<<<grid, block, 0, stream>>>(src, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 1f760d7d33..ab6d9cd49c 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -49,7 +49,7 @@ using namespace cv::gpu::device;
 /////////////////////////////////// Remap ///////////////////////////////////////////////
 namespace cv { namespace gpu { namespace imgproc
 {
-    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
+    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap(0, cudaFilterModeLinear, cudaAddressModeWrap);
 
    __global__ void remap_1c(const float* mapx, const float* mapy, size_t map_step, uchar* out, size_t out_step, int width, int height)
    {
@@ -131,16 +131,12 @@ namespace cv { namespace gpu { namespace imgproc
        grid.x = divUp(dst.cols, threads.x);
        grid.y = divUp(dst.rows, threads.y);

-        tex_remap.filterMode = cudaFilterModeLinear;
-        tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
+        TextureBinder texRemapBinder(&tex_remap, src);

        remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
-        cudaSafeCall( cudaUnbindTexture(tex_remap) );
    }

    void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
@@ -151,8 +147,8 @@ namespace cv { namespace gpu { namespace imgproc
        grid.y = divUp(dst.rows, threads.y);

        remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
-
        cudaSafeCall( cudaGetLastError() );
+
        cudaSafeCall( cudaDeviceSynchronize() );
    }
diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp
index 860d627151..9a3086b67b 100644
--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -77,7 +77,6 @@ namespace cv
    // Returns true if the GPU analogue exists, false otherwise.
    bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);

-
    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    template<class T> static inline void uploadConstant(const char* name, const T& value)
@@ -117,6 +116,49 @@ namespace cv
        cudaSafeCall( cudaUnbindTexture(tex) );
    }

+    class TextureBinder
+    {
+    public:
+        TextureBinder() : tex_(0) {}
+        template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
+        {
+            bind(tex, img);
+        }
+        template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
+        {
+            bind(tex_name, img);
+        }
+        ~TextureBinder() { unbind(); }
+
+        template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
+        {
+            unbind();
+
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
+
+            tex_ = tex;
+        }
+        template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
+        {
+            const textureReference* tex;
+            cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
+            bind(tex, img);
+        }
+
+        void unbind()
+        {
+            if (tex_)
+            {
+                cudaUnbindTexture(tex_);
+                tex_ = 0;
+            }
+        }
+
+    private:
+        const textureReference* tex_;
+    };
+
    class NppStreamHandler
    {
    public:
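TextureBinder turns the bind/unbind pair into scope-based cleanup, as the remap change above shows. A minimal sketch of the intended use, with an illustrative texture name and assuming the cv::gpu types from this header:

    texture<uchar, 2> tex_src;

    void processSketch(const cv::gpu::DevMem2D& src)
    {
        cv::gpu::TextureBinder binder(&tex_src, src);  // binds in the constructor
        // ... launch kernels that sample tex_src ...
        cudaSafeCall( cudaDeviceSynchronize() );
    }   // unbinds when binder leaves scope, on success or error alike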
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index 2602c05749..3636853b4d 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -43,6 +43,7 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 
 using namespace cv::gpu::device;
 
@@ -62,7 +63,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    ///////////////////////////////////////////////////////////////////////////

    template<typename T>
-    __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
+    __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -162,7 +163,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    }

    template<typename T>
-    __global__ void set_to_without_mask(T * mat, int cols, int rows, size_t step, int channels)
+    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -175,7 +176,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    }

    template<typename T>
-    __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, size_t step, int channels, size_t step_mask)
+    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -237,19 +238,16 @@ namespace cv { namespace gpu { namespace matrix_operations {
    //////////////////////////////// ConvertTo ////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

-    template <typename T, typename D>
-    class Convertor
+    template <typename T, typename D> struct Convertor : unary_function<T, D>
    {
-    public:
        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}

-        __device__ __forceinline__ D operator()(const T& src)
+        __device__ __forceinline__ D operator()(const T& src) const
        {
            return saturate_cast<D>(alpha * src + beta);
        }

-    private:
-        double alpha, beta;
+        const double alpha, beta;
    };

    template<typename T, typename D>
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 551ccd9a72..a22077e2f6 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -225,7 +225,7 @@ namespace cv { namespace gpu { namespace surf
    };

    template <typename Mask>
-    __global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
+    __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
    {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110

@@ -346,7 +346,7 @@ namespace cv { namespace gpu { namespace surf
    ////////////////////////////////////////////////////////////////////////
    // INTERPOLATION

-    __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer,
+    __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
        float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
        unsigned int* featureCounter)
    {
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index a49c8a5569..874e6e506d 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -45,13 +45,10 @@
 using namespace cv;
 using namespace cv::gpu;
 
-////////////////////////////////////////////////////////////////////////
-//////////////////////////////// GpuMat ////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-
-
 #if !defined (HAVE_CUDA)
 
+void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }
+void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); }
 void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); }
 bool cv::gpu::CudaMem::canMapHostMemory() { throw_nogpu(); return false; }
 void cv::gpu::CudaMem::release() { throw_nogpu(); }
@@ -59,9 +56,15 @@ GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return Gpu
 
 #else /* !defined (HAVE_CUDA) */
 
-///////////////////////////////////////////////////////////////////////
-//////////////////////////////// CudaMem //////////////////////////////
-///////////////////////////////////////////////////////////////////////
+void cv::gpu::registerPageLocked(Mat& m)
+{
+    cudaSafeCall( cudaHostRegister(m.ptr(), m.step * m.rows, cudaHostRegisterPortable) );
+}
+
+void cv::gpu::unregisterPageLocked(Mat& m)
+{
+    cudaSafeCall( cudaHostUnregister(m.ptr()) );
+}
 
 bool cv::gpu::CudaMem::canMapHostMemory()
 {
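The helpers wrap cudaHostRegister/cudaHostUnregister (a CUDA 4.0+ runtime feature), so an existing cv::Mat can be pinned in place rather than reallocated as CudaMem. A short usage sketch:

    cv::Mat frame(1080, 1920, CV_8UC1);
    cv::gpu::registerPageLocked(frame);   // pin the existing allocation

    cv::gpu::GpuMat d_frame;
    d_frame.upload(frame);                // copies from pinned memory run faster
                                          // and can overlap kernels via streams

    cv::gpu::unregisterPageLocked(frame); // unpin before the Mat is released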
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
index a0b931ca7f..22eb7ffada 100644
--- a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
@@ -52,15 +52,13 @@ namespace cv { namespace gpu { namespace device
 {
    //! Mask accessor

-    class MaskReader
+    struct MaskReader
    {
-    public:
        explicit MaskReader(const PtrStep& mask_): mask(mask_) {}

        __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }

-    private:
-        PtrStep mask;
+        const PtrStep mask;
    };

    struct NoMask
@@ -159,7 +157,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<3>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
@@ -170,7 +168,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
@@ -183,7 +181,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<4>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
@@ -196,7 +194,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
@@ -210,7 +208,7 @@ namespace cv { namespace gpu { namespace device
    };

    template <typename T, typename D, typename UnOp, typename Mask>
-    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
+    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, const UnOp op)
    {
        typedef typename UnReadWriteTraits<T, D>::read_type read_type;
        typedef typename UnReadWriteTraits<T, D>::write_type write_type;
@@ -227,7 +225,7 @@ namespace cv { namespace gpu { namespace device

            if (x_shifted + shift - 1 < src_.cols)
            {
-                read_type src_n_el = ((const read_type*)src)[x];
+                const read_type src_n_el = ((const read_type*)src)[x];
                write_type dst_n_el;

                OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
@@ -246,7 +244,7 @@ namespace cv { namespace gpu { namespace device
    }

    template <typename T, typename D, typename UnOp, typename Mask>
-    static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep_<D> dst, const Mask mask, UnOp op)
+    static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep_<D> dst, const Mask mask, const UnOp op)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -259,7 +257,7 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_,
-        const Mask mask, BinOp op)
+        const Mask mask, const BinOp op)
    {
        typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
        typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
@@ -278,8 +276,8 @@ namespace cv { namespace gpu { namespace device

            if (x_shifted + shift - 1 < src1_.cols)
            {
-                read_type1 src1_n_el = ((const read_type1*)src1)[x];
-                read_type2 src2_n_el = ((const read_type2*)src2)[x];
+                const read_type1 src1_n_el = ((const read_type1*)src1)[x];
+                const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                write_type dst_n_el;

                OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
@@ -299,15 +297,15 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst,
-        const Mask mask, BinOp op)
+        const Mask mask, const BinOp op)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < src1.cols && y < src1.rows && mask(y, x))
        {
-            T1 src1_data = src1.ptr(y)[x];
-            T2 src2_data = src2.ptr(y)[x];
+            const T1 src1_data = src1.ptr(y)[x];
+            const T2 src2_data = src2.ptr(y)[x];
            dst.ptr(y)[x] = op(src1_data, src2_data);
        }
    }
@@ -316,7 +314,7 @@ namespace cv { namespace gpu { namespace device
    template<> struct TransformDispatcher<false>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
        {
            dim3 threads(16, 16, 1);
            dim3 grid(1, 1, 1);
@@ -332,7 +330,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
        {
            dim3 threads(16, 16, 1);
            dim3 grid(1, 1, 1);
@@ -350,7 +348,7 @@ namespace cv { namespace gpu { namespace device
    };
    template<> struct TransformDispatcher<true>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
-        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
        {
            const int shift = UnReadWriteTraits<T, D>::shift;
@@ -368,7 +366,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
        {
            const int shift = BinReadWriteTraits<T1, T2, D>::shift;
@@ -413,13 +411,13 @@ namespace cv { namespace gpu { namespace device
    };

    template <typename T, typename D, typename UnOp, typename Mask>
-    static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+    static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
    {
        TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
    }

    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-    static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+    static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
    {
        TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
    }
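The const-reference plumbing above is what lets user functors travel unchanged into the kernels. A sketch of a caller, assuming unary_function is visible inside cv::gpu::device as the functional header below suggests:

    using namespace cv::gpu;
    using namespace cv::gpu::device;

    struct Scale : unary_function<float, float>
    {
        float s;
        explicit Scale(float s_) : s(s_) {}
        __device__ __forceinline__ float operator()(float x) const { return s * x; }
    };

    void scaleSketch(const DevMem2Df& src, const DevMem2Df& dst, cudaStream_t stream)
    {
        transform(src, dst, Scale(2.0f), stream);  // op now passes by const reference
    }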
diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp
index 28889945e6..b3abdf64a1 100644
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -73,7 +73,7 @@ namespace cv { namespace gpu { namespace device
    using thrust::bit_and;
    using thrust::bit_or;
    using thrust::bit_xor;
-    template <typename T> struct bit_not : public unary_function<T, T>
+    template <typename T> struct bit_not : unary_function<T, T>
    {
        __forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
    };
@@ -81,12 +81,12 @@ namespace cv { namespace gpu { namespace device
    using thrust::identity;

 #define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
-    template <> struct name<type> : public binary_function<type, type, type> \
+    template <> struct name<type> : binary_function<type, type, type> \
    { \
        __forceinline__ __device__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
    };

-    template <typename T> struct maximum : public binary_function<T, T, T>
+    template <typename T> struct maximum : binary_function<T, T, T>
    {
        __forceinline__ __device__ T operator()(const T& lhs, const T& rhs) const {return lhs < rhs ? rhs : lhs;}
    };
@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, fmax)
    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, fmax)

-    template <typename T> struct minimum : public binary_function<T, T, T>
+    template <typename T> struct minimum : binary_function<T, T, T>
    {
        __forceinline__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;}
    };
@@ -126,31 +126,31 @@ namespace cv { namespace gpu { namespace device
    using thrust::not2;

 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
-    template <typename T> struct func ## _func : public unary_function<T, float> \
+    template <typename T> struct func ## _func : unary_function<T, float> \
    { \
-        __forceinline__ __device__ float operator ()(const T& v) \
+        __forceinline__ __device__ float operator ()(const T& v) const \
        { \
            return func ## f(v); \
        } \
    }; \
-    template <> struct func ## _func<double> : public unary_function<double, double> \
+    template <> struct func ## _func<double> : unary_function<double, double> \
    { \
-        __forceinline__ __device__ double operator ()(double v) \
+        __forceinline__ __device__ double operator ()(double v) const \
        { \
            return func(v); \
        } \
    };

 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \
-    template <typename T> struct func ## _func : public binary_function<T, T, float> \
+    template <typename T> struct func ## _func : binary_function<T, T, float> \
    { \
-        __forceinline__ __device__ float operator ()(const T& v1, const T& v2) \
+        __forceinline__ __device__ float operator ()(const T& v1, const T& v2) const \
        { \
            return func ## f(v1, v2); \
        } \
    }; \
-    template <> struct func ## _func<double> : public binary_function<double, double, double> \
+    template <> struct func ## _func<double> : binary_function<double, double, double> \
    { \
-        __forceinline__ __device__ double operator ()(double v1, double v2) \
+        __forceinline__ __device__ double operator ()(double v1, double v2) const \
        { \
            return func(v1, v2); \
        } \
@@ -184,7 +184,7 @@ namespace cv { namespace gpu { namespace device
 #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
 #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR

-    template <typename T> struct hypot_sqr_func : public binary_function<T, T, T>
+    template <typename T> struct hypot_sqr_func : binary_function<T, T, T>
    {
        __forceinline__ __device__ T operator ()(T src1, T src2) const
        {
@@ -192,15 +192,15 @@ namespace cv { namespace gpu { namespace device
        }
    };

-    template <typename T, typename D> struct saturate_cast_func : public unary_function<T, D>
+    template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
    {
-        __forceinline__ __device__ D operator ()(const T& v)
+        __forceinline__ __device__ D operator ()(const T& v) const
        {
            return saturate_cast<D>(v);
        }
    };

-    template <typename T> struct thresh_binary_func : public unary_function<T, T>
+    template <typename T> struct thresh_binary_func : unary_function<T, T>
    {
        __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@@ -209,10 +209,10 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? maxVal : 0;
        }

-        T thresh;
-        T maxVal;
+        const T thresh;
+        const T maxVal;
    };
-    template <typename T> struct thresh_binary_inv_func : public unary_function<T, T>
+    template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
    {
        __forceinline__ __host__ __device__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@@ -221,10 +221,10 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? 0 : maxVal;
        }

-        T thresh;
-        T maxVal;
+        const T thresh;
+        const T maxVal;
    };
-    template <typename T> struct thresh_trunc_func : public unary_function<T, T>
+    template <typename T> struct thresh_trunc_func : unary_function<T, T>
    {
        explicit __forceinline__ __host__ __device__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
@@ -233,11 +233,10 @@ namespace cv { namespace gpu { namespace device
            return minimum<T>()(src, thresh);
        }

-        T thresh;
+        const T thresh;
    };
-    template <typename T> struct thresh_to_zero_func : public unary_function<T, T>
+    template <typename T> struct thresh_to_zero_func : unary_function<T, T>
    {
-    public:
        explicit __forceinline__ __host__ __device__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}

        __forceinline__ __device__ T operator()(const T& src) const
@@ -245,11 +244,10 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? src : 0;
        }

-        T thresh;
+        const T thresh;
    };
-    template <typename T> struct thresh_to_zero_inv_func : public unary_function<T, T>
+    template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
    {
-    public:
        explicit __forceinline__ __host__ __device__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}

        __forceinline__ __device__ T operator()(const T& src) const
@@ -257,36 +255,36 @@ namespace cv { namespace gpu { namespace device
            return src > thresh ? 0 : src;
        }

-        T thresh;
+        const T thresh;
    };

-    template <typename Op> struct binder1st : public unary_function<typename Op::second_argument_type, typename Op::result_type>
+    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
    {
        __forceinline__ __host__ __device__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}

-        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a)
+        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a) const
        {
            return op(arg1, a);
        }

-        Op op;
-        typename Op::first_argument_type arg1;
+        const Op op;
+        const typename Op::first_argument_type arg1;
    };
    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder1st<Op> bind1st(const Op& op, const T& x)
    {
        return binder1st<Op>(op, typename Op::first_argument_type(x));
    }

-    template <typename Op> struct binder2nd : public unary_function<typename Op::first_argument_type, typename Op::result_type>
+    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
    {
        __forceinline__ __host__ __device__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}

-        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a)
+        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a) const
        {
            return op(a, arg2);
        }

-        Op op;
-        typename Op::second_argument_type arg2;
+        const Op op;
+        const typename Op::second_argument_type arg2;
    };
    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder2nd<Op> bind2nd(const Op& op, const T& x)
    {
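binder1st/binder2nd mirror their std:: namesakes but are __device__-callable, so a scalar operand can be baked into an existing binary functor. A small sketch, assuming thrust::plus is available through the thrust functional header this file builds on:

    using namespace cv::gpu;
    using namespace cv::gpu::device;

    void addScalarSketch(const DevMem2D_<int>& src, const DevMem2D_<int>& dst, cudaStream_t stream)
    {
        // dst = src + 5, without writing a dedicated functor
        transform(src, dst, bind2nd(thrust::plus<int>(), 5), stream);
    }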
diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp
index f2e447269e..4f756e36da 100644
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -48,12 +48,12 @@ namespace cv { namespace gpu { namespace device
 {
    template <typename T, typename D, typename UnOp>
-    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream = 0)
+    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
    {
        detail::transform_caller(src, dst, op, detail::NoMask(), stream);
    }
    template <typename T, typename D, typename UnOp>
-    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, UnOp op,
+    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, const UnOp& op,
        cudaStream_t stream = 0)
    {
        detail::transform_caller(src, dst, op, detail::MaskReader(mask), stream);
@@ -61,13 +61,13 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2, typename D, typename BinOp>
    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
-        BinOp op, cudaStream_t stream = 0)
+        const BinOp& op, cudaStream_t stream = 0)
    {
        detail::transform_caller(src1, src2, dst, op, detail::NoMask(), stream);
    }
    template <typename T1, typename T2, typename D, typename BinOp>
    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
-        const PtrStep& mask, BinOp op, cudaStream_t stream = 0)
+        const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0)
    {
        detail::transform_caller(src1, src2, dst, op, detail::MaskReader(mask), stream);
    }
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp
index a3b4dafa94..b0dca8afeb 100644
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -65,15 +65,15 @@ namespace cv { namespace gpu { namespace device
 {
-    template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b)
+    template <typename T> void __host__ __device__ __forceinline__ swap(T& a, T& b)
    {
-        T temp = a;
+        const T temp = a;
        a = b;
        b = temp;
    }

    // warp-synchronous 32 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, Op op)
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;

@@ -88,7 +88,7 @@ namespace cv { namespace gpu { namespace device
    }

    // warp-synchronous 16 elements reduction
-    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, Op op)
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        data[tid] = partial_reduction;

@@ -102,7 +102,7 @@ namespace cv { namespace gpu { namespace device
    }

    // warp-synchronous reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, Op op)
+    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        if (tid < n)
            data[tid] = partial_reduction;
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
index e4c06ac92c..9a4e6ec91c 100644
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -107,9 +107,9 @@ namespace
        maxCandidates = min(static_cast<int>(1.5 * maxFeatures), 65535);

        CV_Assert(maxFeatures > 0);
-
-        cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 1) * sizeof(unsigned int)) );
-        cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 1) * sizeof(unsigned int)) );
+
+        counters.create(1, nOctaves + 1, CV_32SC1);
+        counters.setTo(Scalar::all(0));

        uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates);
        uploadConstant("cv::gpu::surf::c_max_features", maxFeatures);
@@ -118,30 +118,20 @@ namespace
        uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers);
        uploadConstant("cv::gpu::surf::c_hessianThreshold", static_cast<float>(hessianThreshold));

-        bindTexture("cv::gpu::surf::imgTex", (DevMem2D)img);
+        imgTex.bind("cv::gpu::surf::imgTex", (DevMem2D)img);

        integralBuffered(img, sum, intBuffer);
-        bindTexture("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);
+        sumTex.bind("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);

        if (use_mask)
        {
            min(mask, 1.0, mask1);
            integralBuffered(mask1, maskSum, intBuffer);
-            bindTexture("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
+            maskSumTex.bind("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
        }
    }

-    ~SURF_GPU_Invoker()
-    {
-        cudaSafeCall( cudaFree(d_counters) );
-
-        unbindTexture("cv::gpu::surf::imgTex");
-        unbindTexture("cv::gpu::surf::sumTex");
-        if (use_mask)
-            unbindTexture("cv::gpu::surf::maskSumTex");
-    }
-
    void detectKeypoints(GpuMat& keypoints)
    {
        ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, det);
@@ -162,11 +152,11 @@ namespace

            icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);

-            icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 1 + octave,
+            icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
                img_rows, img_cols, octave, use_mask, nOctaveLayers);

            unsigned int maxCounter;
-            cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
            maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));

            if (maxCounter > 0)
@@ -174,11 +164,11 @@ namespace
                icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr<int4>(), maxCounter,
                    keypoints.ptr<float>(SURF_GPU::SF_X), keypoints.ptr<float>(SURF_GPU::SF_Y),
                    keypoints.ptr<int>(SURF_GPU::SF_LAPLACIAN), keypoints.ptr<float>(SURF_GPU::SF_SIZE),
-                    keypoints.ptr<float>(SURF_GPU::SF_HESSIAN), d_counters);
+                    keypoints.ptr<float>(SURF_GPU::SF_HESSIAN), counters.ptr<unsigned int>());
            }
        }
        unsigned int featureCounter;
-        cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpy(&featureCounter, counters.ptr<unsigned int>(), sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));

        keypoints.cols = featureCounter;
@@ -226,7 +216,9 @@ namespace
        int maxCandidates;
        int maxFeatures;

-        unsigned int* d_counters;
+        GpuMat counters;
+
+        TextureBinder imgTex, sumTex, maskSumTex;
    };
 }
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index ef2e2b7a68..1b2cbbc30b 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -318,7 +318,7 @@ TEST(BruteForceMatcher)
    GPU_OFF;

    SUBTEST << "knnMatch";
-    int knn = 10;
+    int knn = 2;

    CPU_ON;
    matcher.knnMatch(query, train, matches, knn);
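Dropping the benchmark from k = 10 to k = 2 matches how knnMatch is used in practice: two neighbours are exactly what the Lowe ratio test needs. A sketch of that use (0.8 is a conventional threshold, an assumption of this sketch rather than anything the sample fixes):

    std::vector< std::vector<cv::DMatch> > knnMatches;
    matcher.knnMatch(query, train, knnMatches, 2);

    std::vector<cv::DMatch> good;
    for (size_t i = 0; i < knnMatches.size(); ++i)
    {
        if (knnMatches[i].size() == 2 &&
            knnMatches[i][0].distance < 0.8f * knnMatches[i][1].distance)
            good.push_back(knnMatches[i][0]);   // best match passes the ratio test
    }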