From d22516872cab0fa7c9b661f85e1eb1d36b2ff7cb Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 13 Apr 2015 17:12:04 +0300 Subject: [PATCH] fix BruteForceMatcher resource distribution added launch bounds attributes for all CUDA kernels --- modules/gpu/src/cuda/bf_knnmatch.cu | 9 +++++++++ modules/gpu/src/cuda/bf_match.cu | 6 ++++++ modules/gpu/src/cuda/bf_radius_match.cu | 2 ++ 3 files changed, 17 insertions(+) diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index 66e37d088a..3e5bc741ff 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device // Calc distance kernel template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void calcDistanceUnrolled(const PtrStepSz query, const PtrStepSz train, const Mask mask, PtrStepf allDist) { extern __shared__ int smem[]; @@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void calcDistance(const PtrStepSz query, const PtrStepSz train, const Mask mask, PtrStepf allDist) { extern __shared__ int smem[]; @@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device // find knn match kernel template + __launch_bounds__(BLOCK_SIZE) __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance) { const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index f7bdcdc0f1..c2ae48bb30 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { @@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { @@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 44cd2b55f9..d83f9f7f96 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device // Match Unrolled template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { @@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device // Match template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) {