diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp index 555fa7b86c..8fbe296d80 100644 --- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp +++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp @@ -374,7 +374,23 @@ CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArr //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format. CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); -struct CV_EXPORTS ConvolveBuf +//! computes convolution (or cross-correlation) of two images using discrete Fourier transform +//! supports source images of 32FC1 type only +//! result matrix will have 32FC1 type +class CV_EXPORTS Convolution : public Algorithm +{ +public: + virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0; +}; +CV_EXPORTS Ptr createConvolution(Size user_block_size = Size()); + +__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__; +inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr , Stream& stream) +{ + createConvolution()->convolve(image, templ, result, ccorr, stream); +} + +struct ConvolveBuf { Size result_size; Size block_size; @@ -385,15 +401,15 @@ struct CV_EXPORTS ConvolveBuf GpuMat image_spect, templ_spect, result_spect; GpuMat image_block, templ_block, result_data; - void create(Size image_size, Size templ_size); - static Size estimateBlockSize(Size result_size, Size templ_size); + void create(Size, Size){} + static Size estimateBlockSize(Size, Size){ return Size(); } }; -//! computes convolution (or cross-correlation) of two images using discrete Fourier transform -//! supports source images of 32FC1 type only -//! result matrix will have 32FC1 type -CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); -CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); +__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__; +inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream) +{ + createConvolution(buf.user_block_size)->convolve(image, templ, result, ccorr, stream); +} }} // namespace cv { namespace gpu { diff --git a/modules/gpuarithm/perf/perf_arithm.cpp b/modules/gpuarithm/perf/perf_arithm.cpp index 5f15fb47da..dfeafa0fa4 100644 --- a/modules/gpuarithm/perf/perf_arithm.cpp +++ b/modules/gpuarithm/perf/perf_arithm.cpp @@ -228,10 +228,11 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve, cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1); d_templ.upload(templ); + cv::Ptr convolution = cv::gpu::createConvolution(); + cv::gpu::GpuMat dst; - cv::gpu::ConvolveBuf d_buf; - TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf); + TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr); GPU_SANITY_CHECK(dst); } diff --git a/modules/gpuarithm/src/arithm.cpp b/modules/gpuarithm/src/arithm.cpp index 88af76a170..6045cf5baf 100644 --- a/modules/gpuarithm/src/arithm.cpp +++ b/modules/gpuarithm/src/arithm.cpp @@ -54,9 +54,7 @@ void cv::gpu::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, flo void cv::gpu::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); } -void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); } -void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); } -void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); } +Ptr cv::gpu::createConvolution(Size) { throw_no_cuda(); return Ptr(); } #else /* !defined (HAVE_CUDA) */ @@ -486,136 +484,152 @@ void cv::gpu::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, S } ////////////////////////////////////////////////////////////////////////////// -// convolve +// Convolution -void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size) +#ifdef HAVE_CUFFT + +namespace { - result_size = Size(image_size.width - templ_size.width + 1, - image_size.height - templ_size.height + 1); - - block_size = user_block_size; - if (user_block_size.width == 0 || user_block_size.height == 0) - block_size = estimateBlockSize(result_size, templ_size); - - dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); - dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.))); - - // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192), - // see CUDA Toolkit 4.1 CUFFT Library Programming Guide - if (dft_size.width > 8192) - dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1); - if (dft_size.height > 8192) - dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1); - - // To avoid wasting time doing small DFTs - dft_size.width = std::max(dft_size.width, 512); - dft_size.height = std::max(dft_size.height, 512); - - createContinuous(dft_size, CV_32F, image_block); - createContinuous(dft_size, CV_32F, templ_block); - createContinuous(dft_size, CV_32F, result_data); - - spect_len = dft_size.height * (dft_size.width / 2 + 1); - createContinuous(1, spect_len, CV_32FC2, image_spect); - createContinuous(1, spect_len, CV_32FC2, templ_spect); - createContinuous(1, spect_len, CV_32FC2, result_spect); - - // Use maximum result matrix block size for the estimated DFT block size - block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); - block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); -} + class ConvolutionImpl : public Convolution + { + public: + explicit ConvolutionImpl(Size user_block_size_) : user_block_size(user_block_size_) {} + void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()); -Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/) -{ - int width = (result_size.width + 2) / 3; - int height = (result_size.height + 2) / 3; - width = std::min(width, result_size.width); - height = std::min(height, result_size.height); - return Size(width, height); -} + private: + void create(Size image_size, Size templ_size); + static Size estimateBlockSize(Size result_size); + Size result_size; + Size block_size; + Size user_block_size; + Size dft_size; + int spect_len; -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr) -{ - ConvolveBuf buf; - gpu::convolve(image, templ, result, ccorr, buf); -} + GpuMat image_spect, templ_spect, result_spect; + GpuMat image_block, templ_block, result_data; + }; -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream) -{ -#ifndef HAVE_CUFFT - (void) image; - (void) templ; - (void) result; - (void) ccorr; - (void) buf; - (void) stream; - throw_no_cuda(); -#else - CV_Assert(image.type() == CV_32F); - CV_Assert(templ.type() == CV_32F); + void ConvolutionImpl::create(Size image_size, Size templ_size) + { + result_size = Size(image_size.width - templ_size.width + 1, + image_size.height - templ_size.height + 1); + + block_size = user_block_size; + if (user_block_size.width == 0 || user_block_size.height == 0) + block_size = estimateBlockSize(result_size); + + dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); + dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.))); + + // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192), + // see CUDA Toolkit 4.1 CUFFT Library Programming Guide + if (dft_size.width > 8192) + dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1); + if (dft_size.height > 8192) + dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1); + + // To avoid wasting time doing small DFTs + dft_size.width = std::max(dft_size.width, 512); + dft_size.height = std::max(dft_size.height, 512); + + createContinuous(dft_size, CV_32F, image_block); + createContinuous(dft_size, CV_32F, templ_block); + createContinuous(dft_size, CV_32F, result_data); + + spect_len = dft_size.height * (dft_size.width / 2 + 1); + createContinuous(1, spect_len, CV_32FC2, image_spect); + createContinuous(1, spect_len, CV_32FC2, templ_spect); + createContinuous(1, spect_len, CV_32FC2, result_spect); + + // Use maximum result matrix block size for the estimated DFT block size + block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); + block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); + } + + Size ConvolutionImpl::estimateBlockSize(Size result_size) + { + int width = (result_size.width + 2) / 3; + int height = (result_size.height + 2) / 3; + width = std::min(width, result_size.width); + height = std::min(height, result_size.height); + return Size(width, height); + } - buf.create(image.size(), templ.size()); - result.create(buf.result_size, CV_32F); + void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream) + { + GpuMat image = _image.getGpuMat(); + GpuMat templ = _templ.getGpuMat(); - Size& block_size = buf.block_size; - Size& dft_size = buf.dft_size; + CV_Assert( image.type() == CV_32FC1 ); + CV_Assert( templ.type() == CV_32FC1 ); - GpuMat& image_block = buf.image_block; - GpuMat& templ_block = buf.templ_block; - GpuMat& result_data = buf.result_data; + create(image.size(), templ.size()); - GpuMat& image_spect = buf.image_spect; - GpuMat& templ_spect = buf.templ_spect; - GpuMat& result_spect = buf.result_spect; + _result.create(result_size, CV_32FC1); + GpuMat result = _result.getGpuMat(); - cufftHandle planR2C, planC2R; - cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); - cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); + cudaStream_t stream = StreamAccessor::getStream(_stream); - cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) ); - cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) ); + cufftHandle planR2C, planC2R; + cufftSafeCall( cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R) ); + cufftSafeCall( cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C) ); - GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); - gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, - templ_block.cols - templ_roi.cols, 0, Scalar(), stream); + cufftSafeCall( cufftSetStream(planR2C, stream) ); + cufftSafeCall( cufftSetStream(planC2R, stream) ); - cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), - templ_spect.ptr())); + GpuMat templ_roi(templ.size(), CV_32FC1, templ.data, templ.step); + gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + templ_block.cols - templ_roi.cols, 0, Scalar(), _stream); - // Process all blocks of the result matrix - for (int y = 0; y < result.rows; y += block_size.height) - { - for (int x = 0; x < result.cols; x += block_size.width) + cufftSafeCall( cufftExecR2C(planR2C, templ_block.ptr(), templ_spect.ptr()) ); + + // Process all blocks of the result matrix + for (int y = 0; y < result.rows; y += block_size.height) { - Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, - std::min(y + dft_size.height, image.rows) - y); - GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), - image.step); - gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, - 0, image_block.cols - image_roi.cols, 0, Scalar(), stream); - - cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), - image_spect.ptr())); - gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, - 1.f / dft_size.area(), ccorr, stream); - cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), - result_data.ptr())); - - Size result_roi_size(std::min(x + block_size.width, result.cols) - x, - std::min(y + block_size.height, result.rows) - y); - GpuMat result_roi(result_roi_size, result.type(), - (void*)(result.ptr(y) + x), result.step); - GpuMat result_block(result_roi_size, result_data.type(), - result_data.ptr(), result_data.step); - - result_block.copyTo(result_roi, stream); + for (int x = 0; x < result.cols; x += block_size.width) + { + Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, + std::min(y + dft_size.height, image.rows) - y); + GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), + image.step); + gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, + 0, image_block.cols - image_roi.cols, 0, Scalar(), _stream); + + cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), + image_spect.ptr())); + gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, + 1.f / dft_size.area(), ccorr, _stream); + cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), + result_data.ptr())); + + Size result_roi_size(std::min(x + block_size.width, result.cols) - x, + std::min(y + block_size.height, result.rows) - y); + GpuMat result_roi(result_roi_size, result.type(), + (void*)(result.ptr(y) + x), result.step); + GpuMat result_block(result_roi_size, result_data.type(), + result_data.ptr(), result_data.step); + + result_block.copyTo(result_roi, _stream); + } } + + cufftSafeCall( cufftDestroy(planR2C) ); + cufftSafeCall( cufftDestroy(planC2R) ); } +} + +#endif - cufftSafeCall(cufftDestroy(planR2C)); - cufftSafeCall(cufftDestroy(planC2R)); +Ptr cv::gpu::createConvolution(Size user_block_size) +{ +#ifndef HAVE_CUBLAS + (void) user_block_size; + CV_Error(cv::Error::StsNotImplemented, "The library was build without CUFFT"); + return Ptr(); +#else + return new ConvolutionImpl(user_block_size); #endif } diff --git a/modules/gpuarithm/test/test_arithm.cpp b/modules/gpuarithm/test/test_arithm.cpp index 93fb0ae845..0534e219d8 100644 --- a/modules/gpuarithm/test/test_arithm.cpp +++ b/modules/gpuarithm/test/test_arithm.cpp @@ -419,8 +419,10 @@ GPU_TEST_P(Convolve, Accuracy) cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0); cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0); + cv::Ptr conv = cv::gpu::createConvolution(); + cv::gpu::GpuMat dst; - cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr); + conv->convolve(loadMat(src), loadMat(kernel), dst, ccorr); cv::Mat dst_gold; convolveDFT(src, kernel, dst_gold, ccorr); diff --git a/modules/gpuimgproc/src/match_template.cpp b/modules/gpuimgproc/src/match_template.cpp index c5375c2882..059d41ca9f 100644 --- a/modules/gpuimgproc/src/match_template.cpp +++ b/modules/gpuimgproc/src/match_template.cpp @@ -172,15 +172,16 @@ namespace return; } - gpu::ConvolveBuf convolve_buf; - convolve_buf.user_block_size = buf.user_block_size; + Ptr conv = gpu::createConvolution(buf.user_block_size); if (image.channels() == 1) - gpu::convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream); + { + conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream); + } else { GpuMat result_; - gpu::convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream); + conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream); extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream)); } }