diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index 19521a2a64..d6bbb7eb67 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -105,13 +105,13 @@ namespace cv { namespace gpu { namespace bfmatcher template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, @@ -428,7 +428,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con using namespace cv::gpu::bfmatcher; typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); static const match_caller_t match_callers[3][8] = { @@ -454,23 +454,28 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx); ensureSizeIsEnough(nQuery, k, CV_32F, distance); - ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); + if (k != 2) + ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); if (stream) { stream.enqueueMemSet(trainIdx, Scalar::all(-1)); - stream.enqueueMemSet(allDist, Scalar::all(numeric_limits::max())); + if (k != 2) + stream.enqueueMemSet(allDist, Scalar::all(numeric_limits::max())); } else { trainIdx.setTo(Scalar::all(-1)); - allDist.setTo(Scalar::all(numeric_limits::max())); + if (k != 2) + allDist.setTo(Scalar::all(numeric_limits::max())); } match_caller_t func = match_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); + + bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); - func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); + func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc_12, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index 6b1361901c..4cd1142d57 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -87,9 +87,8 @@ namespace cv { namespace gpu { namespace bfmatcher PtrStep curMask; }; - class WithOutMask + struct WithOutMask { - public: __device__ __forceinline__ void nextMask() const { } @@ -102,21 +101,19 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // Reduce Sum - template struct SumReductor; + template struct SumReductor; template <> struct SumReductor<16> { - template static __device__ void reduce(T* sdiff_row, T& mySum) + template static __device__ void reduce(volatile T* sdiff_row, T& mySum) { - volatile T* smem = sdiff_row; - - smem[threadIdx.x] = mySum; + sdiff_row[threadIdx.x] = mySum; if (threadIdx.x < 8) { - smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; - smem[threadIdx.x] = mySum += smem[threadIdx.x + 4]; - smem[threadIdx.x] = mySum += smem[threadIdx.x + 2]; - smem[threadIdx.x] = mySum += smem[threadIdx.x + 1]; + sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 8]; + sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 4]; + sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 2]; + sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 1]; } } }; @@ -344,7 +341,7 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // warpReduceMinIdxIdx - template struct MinIdxIdxWarpReductor; + template struct MinIdxIdxWarpReductor; template <> struct MinIdxIdxWarpReductor<16> { template @@ -435,6 +432,7 @@ namespace cv { namespace gpu { namespace bfmatcher __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem) { loadDescsVals(queryDescs, desc_len, queryVals, smem); + __syncthreads(); } template @@ -778,6 +776,173 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// Knn Match //////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// + + template + __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, + typename Dist::ResultType& distMin1, typename Dist::ResultType& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, + typename Dist::ResultType* smem) + { + ReduceDescCalculator reduceDescCalc; + + reduceDescCalc.prepare(query.ptr(queryIdx), train.cols, (typename Dist::ValueType*)smem); + + typename Dist::ResultType* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y; + + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y) + { + if (m(queryIdx, trainIdx)) + { + Dist dist; + + const T* trainRow = train.ptr(trainIdx); + + reduceDescCalc.calc(trainRow, train.cols, dist, sdiffRow); + + if (threadIdx.x == 0) + { + typename Dist::ResultType val = dist; + + if (val < distMin1) + { + distMin1 = val; + bestTrainIdx1 = trainIdx; + } + else if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = trainIdx; + } + } + } + } + } + + template + __global__ void knnMatch2(const PtrStep_ query, const DevMem2D_ train, const Mask m, PtrStep_ trainIdx, PtrStep_ distance) + { + typedef typename Dist::ResultType ResultType; + typedef typename Dist::ValueType ValueType; + + __shared__ ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + + const int queryIdx = blockIdx.x; + + ResultType distMin1 = numeric_limits::max(); + ResultType distMin2 = numeric_limits::max(); + + int bestTrainIdx1 = -1; + int bestTrainIdx2 = -1; + + distanceCalcLoop(query, train, m, queryIdx, + distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); + __syncthreads(); + + volatile ResultType* sdistMinRow = smem; + volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); + + if (threadIdx.x == 0) + { + sdistMinRow[threadIdx.y] = distMin1; + sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2; + + sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; + sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2; + } + __syncthreads(); + + if (threadIdx.x == 0 && threadIdx.y == 0) + { + distMin1 = numeric_limits::max(); + distMin2 = numeric_limits::max(); + + bestTrainIdx1 = -1; + bestTrainIdx2 = -1; + + #pragma unroll + for (int i = 0; i < BLOCK_DIM_Y; ++i) + { + ResultType val = sdistMinRow[i]; + + if (val < distMin1) + { + distMin1 = val; + bestTrainIdx1 = sbestTrainIdxRow[i]; + } + else if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = sbestTrainIdxRow[i]; + } + } + + #pragma unroll + for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) + { + ResultType val = sdistMinRow[i]; + + if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = sbestTrainIdxRow[i]; + } + } + + trainIdx.ptr(queryIdx)[0] = make_int2(bestTrainIdx1, bestTrainIdx2); + distance.ptr(queryIdx)[0] = make_float2(distMin1, distMin2); + } + } + + template + void knnMatch2Simple_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, cudaStream_t stream) + { + dim3 grid(queryDescs.rows, 1, 1); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + knnMatch2, T> + <<>>(queryDescs, trainDescs, mask, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + template + void knnMatch2Cached_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, cudaStream_t stream) + { + StaticAssert= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length + StaticAssert::check(); // max descriptors length must divide to blockDimX + + dim3 grid(queryDescs.rows, 1, 1); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + knnMatch2, T> + <<>>(queryDescs, trainDescs, mask, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void knnMatch2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, bool cc_12, cudaStream_t stream) + { + if (query.cols < 64) + knnMatch2Cached_caller<16, 16, 64, false, Dist>(query, train, mask, trainIdx, distance, stream); + else if (query.cols == 64) + knnMatch2Cached_caller<16, 16, 64, true, Dist>(query, train, mask, trainIdx, distance, stream); + else if (query.cols < 128) + knnMatch2Cached_caller<16, 16, 128, false, Dist>(query, train, mask, trainIdx, distance, stream); + else if (query.cols == 128 && cc_12) + knnMatch2Cached_caller<16, 16, 128, true, Dist>(query, train, mask, trainIdx, distance, stream); + else if (query.cols < 256 && cc_12) + knnMatch2Cached_caller<16, 16, 256, false, Dist>(query, train, mask, trainIdx, distance, stream); + else if (query.cols == 256 && cc_12) + knnMatch2Cached_caller<16, 16, 256, true, Dist>(query, train, mask, trainIdx, distance, stream); + else + knnMatch2Simple_caller<16, 16, Dist>(query, train, mask, trainIdx, distance, stream); + } /////////////////////////////////////////////////////////////////////////////// // Calc distance kernel @@ -1026,77 +1191,74 @@ namespace cv { namespace gpu { namespace bfmatcher findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream); } - template - void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + template < typename Dist, typename T > + void knnMatchDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream) { if (mask.data) { - calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, SingleMask(mask), allDist, stream); + if (knn == 2) + { + knnMatch2Dispatcher(queryDescs, trainDescs, SingleMask(mask), (DevMem2D_)trainIdx, (DevMem2D_)distance, cc_12, stream); + return; + } + + calcDistanceDispatcher(queryDescs, trainDescs, SingleMask(mask), allDist, stream); } else { - calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, WithOutMask(), allDist, stream); + if (knn == 2) + { + knnMatch2Dispatcher(queryDescs, trainDescs, WithOutMask(), (DevMem2D_)trainIdx, (DevMem2D_)distance, cc_12, stream); + return; + } + + calcDistanceDispatcher(queryDescs, trainDescs, WithOutMask(), allDist, stream); } findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream); } - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template + void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream) + { + knnMatchDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream); + } + + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream) { - if (mask.data) - { - calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - SingleMask(mask), allDist, stream); - } - else - { - calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - WithOutMask(), allDist, stream); - } - - findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream); + knnMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream); } - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream) { - if (mask.data) - { - calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - SingleMask(mask), allDist, stream); - } - else - { - calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - WithOutMask(), allDist, stream); - } - - findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream); + knnMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream); } - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); /////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////// Radius Match ////////////////////////////////// diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp index 516c0f5920..248ca761a4 100644 --- a/modules/gpu/test/test_features2d.cpp +++ b/modules/gpu/test/test_features2d.cpp @@ -320,7 +320,7 @@ TEST_P(BruteForceMatcher, KnnMatch) PRINT_PARAM(distStr); PRINT_PARAM(dim); - const int knn = 3; + const int knn = 2; std::vector< std::vector > matches; diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index 1b2cbbc30b..ebc2dad2ad 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -286,7 +286,7 @@ TEST(BruteForceMatcher) { // Init CPU matcher - int desc_len = 128; + int desc_len = 64; BruteForceMatcher< L2 > matcher; @@ -328,7 +328,7 @@ TEST(BruteForceMatcher) d_matcher.knnMatch(d_query, d_train, d_matches, knn); GPU_OFF; - SUBTEST << "radiusMatch"; + /*SUBTEST << "radiusMatch"; float max_distance = 3.8f; CPU_ON; @@ -337,7 +337,7 @@ TEST(BruteForceMatcher) GPU_ON; d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance); - GPU_OFF; + GPU_OFF;*/ }