diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index d32aa327a0..48d1c35ae6 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -628,6 +628,11 @@ namespace cv
         //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
         CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
 
+        //! computes cross-correlation of two images using FFT
+        //! supports source images of 32FC1 type only
+        //! result matrix will have 32FC1 type
+        CV_EXPORTS void crossCorr(const GpuMat& image, const GpuMat& templ, GpuMat& result);
+
         //! computes the proximity map for the raster template and the image where the template is searched for
         CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
 
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 4ba2a89797..f8abe3733f 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -40,6 +40,7 @@
 //
 //M*/
 
+#include <cufft.h>
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
 
@@ -749,5 +750,32 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall(cudaThreadSynchronize());
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    // multiplyAndNormalizeSpects
+
+    __global__ void multiplyAndNormalizeSpectsKernel(
+            int n, float scale, const cufftComplex* a,
+            const cufftComplex* b, cufftComplex* c)
+    {
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        if (x < n)
+        {
+            cufftComplex v = cuCmulf(a[x], cuConjf(b[x]));
+            c[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
+        }
+    }
+
+
+    // Performs per-element multiplication and normalization of two spectra
+    void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
+                                    const cufftComplex* b, cufftComplex* c)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(n, threads.x));
+
+        multiplyAndNormalizeSpectsKernel<<<grid, threads>>>(n, scale, a, b, c);
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
 }}}
diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index b775e6616d..3ac98c88ec 100644
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -40,7 +40,6 @@
 //
 //M*/
 
-#include <cufft.h>
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/vecmath.hpp"
 
@@ -256,29 +255,6 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
 }
 
 
-__global__ void multiplyAndNormalizeSpectsKernel(
-        int n, float scale, const cufftComplex* a,
-        const cufftComplex* b, cufftComplex* c)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    if (x < n)
-    {
-        cufftComplex v = cuCmulf(a[x], cuConjf(b[x]));
-        c[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
-    }
-}
-
-
-void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
-                                const cufftComplex* b, cufftComplex* c)
-{
-    dim3 threads(256);
-    dim3 grid(divUp(n, threads.x));
-    multiplyAndNormalizeSpectsKernel<<<grid, threads>>>(n, scale, a, b, c);
-    cudaSafeCall(cudaThreadSynchronize());
-}
-
-
 template <int cn>
 __global__ void matchTemplatePreparedKernel_SQDIFF_8U(
         int w, int h, const PtrStep_<unsigned int> image_sqsum,
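Note for reviewers: the moved kernel computes c[i] = scale * a[i] * conj(b[i]), i.e. the frequency-domain side of the correlation theorem; the inverse transform of this product is the cross-correlation of the two inputs, and scale = 1/dft_size.area() compensates for CUFFT's unnormalized transforms. A minimal CPU reference of the same per-element operation (illustrative only, not part of the patch):

    #include <complex>
    #include <cstddef>

    // Reference for multiplyAndNormalizeSpectsKernel: c[i] = scale * a[i] * conj(b[i]).
    // With a = FFT(image block) and b = FFT(template), the inverse FFT of c is the
    // cross-correlation of the block with the template.
    void multiplyAndNormalizeSpectsRef(std::size_t n, float scale,
                                       const std::complex<float>* a,
                                       const std::complex<float>* b,
                                       std::complex<float>* c)
    {
        for (std::size_t i = 0; i < n; ++i)
            c[i] = scale * a[i] * std::conj(b[i]);
    }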
#include "precomp.hpp" +#include using namespace cv; using namespace cv::gpu; @@ -73,6 +74,7 @@ void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); } +void cv::gpu::crossCorr(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -1062,6 +1064,135 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, i imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType); } +////////////////////////////////////////////////////////////////////////////// +// crossCorr + +namespace +{ + // Estimates optimal block size + void crossCorrOptBlockSize(int w, int h, int tw, int th, int& bw, int& bh) + { + int major, minor; + getComputeCapability(getDevice(), major, minor); + + int scale = 40; + int bh_min = 1024; + int bw_min = 1024; + + // Check whether we use Fermi generation or newer GPU + if (major >= 2) + { + bh_min = 2048; + bw_min = 2048; + } + + bw = std::max(tw * scale, bw_min); + bh = std::max(th * scale, bh_min); + bw = std::min(bw, w); + bh = std::min(bh, h); + } +} + + +namespace cv { namespace gpu { namespace imgproc +{ + void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a, + const cufftComplex* b, cufftComplex* c); +}}} + + +void cv::gpu::crossCorr(const GpuMat& image, const GpuMat& templ, GpuMat& result) +{ + CV_Assert(image.type() == CV_32F); + CV_Assert(templ.type() == CV_32F); + + result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); + + Size block_size; + crossCorrOptBlockSize(result.cols, result.rows, templ.cols, templ.rows, + block_size.width, block_size.height); + + Size dft_size; + dft_size.width = getOptimalDFTSize(block_size.width + templ.cols - 1); + dft_size.height = getOptimalDFTSize(block_size.width + templ.rows - 1); + + block_size.width = std::min(dft_size.width - templ.cols + 1, result.cols); + block_size.height = std::min(dft_size.height - templ.rows + 1, result.rows); + + cufftReal* image_data; + cufftReal* templ_data; + cufftReal* result_data; + cudaSafeCall(cudaMalloc((void**)&image_data, sizeof(cufftReal) * dft_size.area())); + cudaSafeCall(cudaMalloc((void**)&templ_data, sizeof(cufftReal) * dft_size.area())); + cudaSafeCall(cudaMalloc((void**)&result_data, sizeof(cufftReal) * dft_size.area())); + + int spect_len = dft_size.height * (dft_size.width / 2 + 1); + cufftComplex* image_spect; + cufftComplex* templ_spect; + cufftComplex* result_spect; + cudaSafeCall(cudaMalloc((void**)&image_spect, sizeof(cufftComplex) * spect_len)); + cudaSafeCall(cudaMalloc((void**)&templ_spect, sizeof(cufftComplex) * spect_len)); + cudaSafeCall(cudaMalloc((void**)&result_spect, sizeof(cufftComplex) * spect_len)); + + cufftHandle planR2C, planC2R; + cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); + cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); + + GpuMat templ_roi(templ.size(), CV_32S, templ.data, templ.step); + GpuMat templ_block(dft_size, CV_32S, templ_data, dft_size.width * sizeof(cufftReal)); + copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + templ_block.cols - templ_roi.cols, 0); + + cufftSafeCall(cufftExecR2C(planR2C, templ_data, templ_spect)); + + GpuMat 
+    cufftSafeCall(cufftExecR2C(planR2C, templ_data, templ_spect));
+
+    GpuMat image_block(dft_size, CV_32S, image_data, dft_size.width * sizeof(cufftReal));
+
+    // Process all blocks of the result matrix
+    for (int y = 0; y < result.rows; y += block_size.height)
+    {
+        for (int x = 0; x < result.cols; x += block_size.width)
+        {
+            // Locate ROI in the source matrix
+            Size image_roi_size;
+            image_roi_size.width = std::min(x + dft_size.width, image.cols) - x;
+            image_roi_size.height = std::min(y + dft_size.height, image.rows) - y;
+            GpuMat image_roi(image_roi_size, CV_32S, (void*)(image.ptr<float>(y) + x), image.step);
+
+            // Make the source image block continuous
+            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, 0,
+                           image_block.cols - image_roi.cols, 0);
+
+            cufftSafeCall(cufftExecR2C(planR2C, image_data, image_spect));
+
+            imgproc::multiplyAndNormalizeSpects(spect_len, 1.f / dft_size.area(),
+                                                image_spect, templ_spect, result_spect);
+
+            cufftSafeCall(cufftExecC2R(planC2R, result_spect, result_data));
+
+            // Copy the result block into the appropriate part of the result matrix.
+            // We can't compute it in place, as the result of the CUFFT transforms
+            // is always continuous, while the result matrix and its blocks can have gaps.
+            Size result_roi_size;
+            result_roi_size.width = std::min(x + block_size.width, result.cols) - x;
+            result_roi_size.height = std::min(y + block_size.height, result.rows) - y;
+            GpuMat result_roi(result_roi_size, CV_32F, (void*)(result.ptr<float>(y) + x), result.step);
+            GpuMat result_block(result_roi_size, CV_32F, result_data, dft_size.width * sizeof(cufftReal));
+            result_block.copyTo(result_roi);
+        }
+    }
+
+    cufftSafeCall(cufftDestroy(planR2C));
+    cufftSafeCall(cufftDestroy(planC2R));
+
+    cudaSafeCall(cudaFree(image_spect));
+    cudaSafeCall(cudaFree(templ_spect));
+    cudaSafeCall(cudaFree(result_spect));
+    cudaSafeCall(cudaFree(image_data));
+    cudaSafeCall(cudaFree(templ_data));
+    cudaSafeCall(cudaFree(result_data));
+}
+
+
 #endif /* !defined (HAVE_CUDA) */
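A minimal usage sketch of the new entry point (not part of the patch; assumes a CUDA-capable device and a CUFFT-enabled build, with synthetic data standing in for real images):

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // crossCorr accepts 32FC1 sources only; the result has size
        // (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1), type 32FC1
        cv::Mat image(512, 512, CV_32F), templ(16, 16, CV_32F);
        cv::randu(image, cv::Scalar(0), cv::Scalar(1));
        cv::randu(templ, cv::Scalar(0), cv::Scalar(1));

        cv::gpu::GpuMat d_image(image), d_templ(templ), d_result;
        cv::gpu::crossCorr(d_image, d_templ, d_result);

        cv::Mat result;
        d_result.download(result);
        return 0;
    }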
diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp
index 7ff33da6ec..1106ca9b73 100644
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@@ -41,7 +41,6 @@
 //M*/
 
 #include "precomp.hpp"
-#include <cufft.h>
 
 using namespace cv;
 using namespace cv::gpu;
 
@@ -54,9 +53,6 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 
 namespace cv { namespace gpu { namespace imgproc
 {
-    void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
-                                    const cufftComplex* b, cufftComplex* c);
-
     void matchTemplateNaive_CCORR_8U(
             const DevMem2D image, const DevMem2D templ, DevMem2Df result, int cn);
 
@@ -147,7 +143,7 @@ namespace cv { namespace gpu { namespace imgproc
             DevMem2Df result);
 
     void normalize_8U(int w, int h, const DevMem2D_<unsigned int> image_sqsum,
-            unsigned int templ_sqsum, DevMem2Df result, int cn);
+                      unsigned int templ_sqsum, DevMem2Df result, int cn);
 
     void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn);
 }}}
 
@@ -155,11 +151,6 @@ namespace cv { namespace gpu { namespace imgproc
 
 namespace
 {
-    // Estimates optimal blocks size for FFT method
-    void estimateBlockSize(int w, int h, int tw, int th, int& bw, int& bh);
-
-    // Performs FFT-based cross-correlation
-    void crossCorr_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result);
 
     // Evaluates optimal template's area threshold. If
     // template's area is less than the threshold, we use naive match
@@ -178,110 +169,6 @@ namespace
     void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result);
 
-    void estimateBlockSize(int w, int h, int tw, int th, int& bw, int& bh)
-    {
-        int major, minor;
-        getComputeCapability(getDevice(), major, minor);
-
-        int scale = 40;
-        int bh_min = 1024;
-        int bw_min = 1024;
-
-        if (major >= 2) // Fermi generation or newer
-        {
-            bh_min = 2048;
-            bw_min = 2048;
-        }
-
-        bw = std::max(tw * scale, bw_min);
-        bh = std::max(th * scale, bh_min);
-        bw = std::min(bw, w);
-        bh = std::min(bh, h);
-    }
-
-
-    void crossCorr_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result)
-    {
-        CV_Assert(image.type() == CV_32F);
-        CV_Assert(templ.type() == CV_32F);
-
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-
-        Size block_size;
-        estimateBlockSize(result.cols, result.rows, templ.cols, templ.rows,
-                          block_size.width, block_size.height);
-
-        Size dft_size;
-        dft_size.width = getOptimalDFTSize(block_size.width + templ.cols - 1);
-        dft_size.height = getOptimalDFTSize(block_size.width + templ.rows - 1);
-
-        block_size.width = std::min(dft_size.width - templ.cols + 1, result.cols);
-        block_size.height = std::min(dft_size.height - templ.rows + 1, result.rows);
-
-        cufftReal* image_data;
-        cufftReal* templ_data;
-        cufftReal* result_data;
-        cudaSafeCall(cudaMalloc((void**)&image_data, sizeof(cufftReal) * dft_size.area()));
-        cudaSafeCall(cudaMalloc((void**)&templ_data, sizeof(cufftReal) * dft_size.area()));
-        cudaSafeCall(cudaMalloc((void**)&result_data, sizeof(cufftReal) * dft_size.area()));
-
-        int spect_len = dft_size.height * (dft_size.width / 2 + 1);
-        cufftComplex* image_spect;
-        cufftComplex* templ_spect;
-        cufftComplex* result_spect;
-        cudaSafeCall(cudaMalloc((void**)&image_spect, sizeof(cufftComplex) * spect_len));
-        cudaSafeCall(cudaMalloc((void**)&templ_spect, sizeof(cufftComplex) * spect_len));
-        cudaSafeCall(cudaMalloc((void**)&result_spect, sizeof(cufftComplex) * spect_len));
-
-        cufftHandle planR2C, planC2R;
-        cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-        cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
-
-        GpuMat templ_roi(templ.size(), CV_32S, templ.data, templ.step);
-        GpuMat templ_block(dft_size, CV_32S, templ_data, dft_size.width * sizeof(cufftReal));
-        copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
-                       templ_block.cols - templ_roi.cols, 0);
-        cufftSafeCall(cufftExecR2C(planR2C, templ_data, templ_spect));
-
-        GpuMat image_block(dft_size, CV_32S, image_data, dft_size.width * sizeof(cufftReal));
-
-        for (int y = 0; y < result.rows; y += block_size.height)
-        {
-            for (int x = 0; x < result.cols; x += block_size.width)
-            {
-                Size image_roi_size;
-                image_roi_size.width = min(x + dft_size.width, image.cols) - x;
-                image_roi_size.height = min(y + dft_size.height, image.rows) - y;
-                GpuMat image_roi(image_roi_size, CV_32S, (void*)(image.ptr<float>(y) + x), image.step);
-                copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, 0,
-                               image_block.cols - image_roi.cols, 0);
-
-                cufftSafeCall(cufftExecR2C(planR2C, image_data, image_spect));
-                imgproc::multiplyAndNormalizeSpects(spect_len, 1.f / dft_size.area(),
-                                                    image_spect, templ_spect, result_spect);
-                cufftSafeCall(cufftExecC2R(planC2R, result_spect, result_data));
-
-                Size result_roi_size;
-                result_roi_size.width = min(x + block_size.width, result.cols) - x;
-                result_roi_size.height = min(y + block_size.height, result.rows) - y;
-                GpuMat result_roi(result_roi_size, CV_32F, (void*)(result.ptr<float>(y) + x), result.step);
-                GpuMat result_block(result_roi_size, CV_32F, result_data, dft_size.width * sizeof(cufftReal));
-                result_block.copyTo(result_roi);
-            }
-        }
-
-        cufftSafeCall(cufftDestroy(planR2C));
-        cufftSafeCall(cufftDestroy(planC2R));
-
-        cudaSafeCall(cudaFree(image_spect));
-        cudaSafeCall(cudaFree(templ_spect));
-        cudaSafeCall(cudaFree(result_spect));
-        cudaSafeCall(cudaFree(image_data));
-        cudaSafeCall(cudaFree(templ_data));
-        cudaSafeCall(cudaFree(result_data));
-    }
-
-
     int getTemplateThreshold(int method, int depth)
     {
         switch (method)
@@ -309,7 +196,7 @@ namespace
         }
 
         GpuMat result_;
-        crossCorr_32F(image.reshape(1), templ.reshape(1), result_);
+        crossCorr(image.reshape(1), templ.reshape(1), result_);
         imgproc::extractFirstChannel_32F(result_, result, image.channels());
     }
 
@@ -541,5 +428,3 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method)
 }
 
 #endif
-
-
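With the duplicated implementation gone, the CV_TM_CCORR branch of gpu::matchTemplate reuses the public crossCorr for templates whose area is at or above getTemplateThreshold; smaller templates keep the naive kernels. Continuing the sketch above:

    cv::gpu::GpuMat d_ccorr;
    cv::gpu::matchTemplate(d_image, d_templ, d_ccorr, CV_TM_CCORR);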