From 40ee754e281e530dcdf9dbd64437051535aed4f8 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 19 Oct 2011 09:53:22 +0000 Subject: [PATCH] added WITH_CUFFT and WITH_CUBLAS flags to cmake scripts fixed gpu module error reporting added asynchronous version of some functions --- CMakeLists.txt | 13 + cvconfig.h.cmake | 6 + modules/core/include/opencv2/core/types_c.h | 4 +- modules/core/src/system.cpp | 5 +- modules/gpu/CMakeLists.txt | 11 +- modules/gpu/include/opencv2/gpu/gpu.hpp | 44 +-- modules/gpu/src/arithm.cpp | 4 +- modules/gpu/src/cascadeclassifier.cpp | 11 +- modules/gpu/src/cuda/imgproc.cu | 121 +++--- modules/gpu/src/cuda/match_template.cu | 388 +++++++++----------- modules/gpu/src/cuda/safe_call.hpp | 31 +- modules/gpu/src/error.cpp | 194 +++++----- modules/gpu/src/imgproc.cpp | 278 ++++++++------ modules/gpu/src/match_template.cpp | 177 ++++----- modules/gpu/src/optical_flow.cpp | 10 +- modules/gpu/src/precomp.hpp | 45 ++- 16 files changed, 712 insertions(+), 630 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 628c23f5f5..ff8f20c646 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -452,8 +452,12 @@ set(WITH_EIGEN ON CACHE BOOL "Include Eigen2/Eigen3 support") if( CMAKE_VERSION VERSION_GREATER "2.8") set(WITH_CUDA ON CACHE BOOL "Include NVidia Cuda Runtime support") + set(WITH_CUFFT ON CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support") + set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support") else() set(WITH_CUDA OFF CACHE BOOL "Include NVidia Cuda Runtime support") + set(WITH_CUFFT OFF CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support") + set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support") endif() set(WITH_OPENNI OFF CACHE BOOL "Include OpenNI support") @@ -995,6 +999,15 @@ if(WITH_CUDA) if(CUDA_FOUND) set(HAVE_CUDA 1) + + if(WITH_CUFFT) + set(HAVE_CUFFT 1) + endif() + + if(WITH_CUBLAS) + set(HAVE_CUBLAS 1) + endif() + message(STATUS "CUDA detected: " ${CUDA_VERSION}) set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") diff --git a/cvconfig.h.cmake b/cvconfig.h.cmake index 602c12d19f..36cf6bbb51 100644 --- a/cvconfig.h.cmake +++ b/cvconfig.h.cmake @@ -172,6 +172,12 @@ /* NVidia Cuda Runtime API*/ #cmakedefine HAVE_CUDA +/* NVidia Cuda Fast Fourier Transform (FFT) API*/ +#cmakedefine HAVE_CUFFT + +/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/ +#cmakedefine HAVE_CUBLAS + /* Compile for 'real' NVIDIA GPU architectures */ #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}" diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h index 11dbabfa7e..e2284a81d7 100644 --- a/modules/core/include/opencv2/core/types_c.h +++ b/modules/core/include/opencv2/core/types_c.h @@ -250,9 +250,7 @@ enum { CV_StsBadMemBlock= -214, /* an allocated block has been corrupted */ CV_StsAssert= -215, /* assertion failed */ CV_GpuNotSupported= -216, - CV_GpuApiCallError= -217, - CV_GpuNppCallError= -218, - CV_GpuCufftCallError= -219 + CV_GpuApiCallError= -217 }; /****************************************************************************************\ diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 2922cdaf8f..7c9b805f8e 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -629,9 +629,8 @@ CV_IMPL const char* 
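A minimal sketch of how code built against cvconfig.h can respect the new optional dependencies: the WITH_CUFFT / WITH_CUBLAS options above only set HAVE_CUFFT / HAVE_CUBLAS, so any CUFFT-backed path should be guarded and fall back to a CV_GpuNotSupported error when the flag is off. The function name below is illustrative, not part of the patch.

    #include "cvconfig.h"
    #include "opencv2/core/core.hpp"
    #ifdef HAVE_CUFFT
    #   include <cufft.h>
    #endif

    // hypothetical placeholder, not an OpenCV function
    void hypotheticalForwardDft()
    {
    #ifdef HAVE_CUFFT
        cufftHandle plan;   // CUFFT is available: create the plan and execute here
        (void)plan;
    #else
        CV_Error(CV_GpuNotSupported, "The library was compiled without CUFFT support");
    #endif
    }
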
cvErrorStr( int status ) case CV_StsNotImplemented : return "The function/feature is not implemented"; case CV_StsBadMemBlock : return "Memory block has been corrupted"; case CV_StsAssert : return "Assertion failed"; - case CV_GpuNotSupported : return "No GPU support"; - case CV_GpuApiCallError : return "Gpu Api call"; - case CV_GpuNppCallError : return "Npp Api call"; + case CV_GpuNotSupported : return "No GPU support"; + case CV_GpuApiCallError : return "Gpu Api call"; }; sprintf(buf, "Unknown %s code %d", status >= 0 ? "status":"error", status); diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 6c1a2225f8..74ccc320e3 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -120,12 +120,19 @@ set_target_properties(${the_target} PROPERTIES target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} ) if (HAVE_CUDA) - target_link_libraries(${the_target} ${CUDA_LIBRARIES}) - CUDA_ADD_CUFFT_TO_TARGET(${the_target}) + target_link_libraries(${the_target} ${CUDA_LIBRARIES}) unset(CUDA_npp_LIBRARY CACHE) find_cuda_helper_libs(npp) target_link_libraries(${the_target} ${CUDA_npp_LIBRARY}) + + if(HAVE_CUFFT) + CUDA_ADD_CUFFT_TO_TARGET(${the_target}) + endif() + + if(HAVE_CUBLAS) + CUDA_ADD_CUBLAS_TO_TARGET(${the_target}) + endif() endif() if(MSVC) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 1b7efb985e..ee9b734f1d 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -141,8 +141,8 @@ namespace cv //////////////////////////////// Error handling //////////////////////// - CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func); - CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func); + //CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func); + //CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func); //////////////////////////////// CudaMem //////////////////////////////// // CudaMem is limited cv::Mat with page locked memory allocation. @@ -628,11 +628,11 @@ namespace cv //! Does mean shift filtering on GPU. CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null()); //! Does mean shift procedure on GPU. CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null()); //! Does mean shift segmentation with elimination of small regions. CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, @@ -683,10 +683,12 @@ namespace cv //! rotate 8bit single or four channel image //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC //! 
supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); + CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, + int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); //! copies 2D array to a larger destination array and pads borders with user-specifiable constant - CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null()); + CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, + const Scalar& value = Scalar(), Stream& stream = Stream::Null()); //! computes the integral image //! sum will have CV_32S type, but will contain unsigned int values @@ -715,21 +717,26 @@ namespace cv CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null()); //! computes Harris cornerness criteria at each image pixel - CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101); - CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101); + CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, + int borderType = BORDER_REFLECT101); + CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, + int borderType = BORDER_REFLECT101); + CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, + int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null()); //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101); CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101); + CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, + int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null()); //! performs per-element multiplication of two full (not packed) Fourier spectrums //! supports 32FC2 matrixes only (interleaved format) - CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false); + CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null()); //! performs per-element multiplication of two full (not packed) Fourier spectrums //! supports 32FC2 matrixes only (interleaved format) - CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, - float scale, bool conjB=false); + CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null()); //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. //! 
Param dft_size is the size of DFT transform. @@ -742,19 +749,14 @@ namespace cv //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved. //! //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format. - CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0); + CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); //! computes convolution (or cross-correlation) of two images using discrete Fourier transform //! supports source images of 32FC1 type only //! result matrix will have 32FC1 type - CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr=false); - struct CV_EXPORTS ConvolveBuf; - - //! buffered version - CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr, ConvolveBuf& buf); + CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); + CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); struct CV_EXPORTS ConvolveBuf { @@ -766,7 +768,7 @@ namespace cv private: static Size estimateBlockSize(Size result_size, Size templ_size); - friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&); + friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream); Size result_size; Size block_size; @@ -778,7 +780,7 @@ namespace cv }; //! computes the proximity map for the raster template and the image where the template is searched for - CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method); + CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null()); //! 
smoothes the source image and downsamples it CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 67da283eb6..ec938fea59 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -93,7 +93,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiStTranspose_32u_C1R(const_cast(src.ptr()), static_cast(src.step), + ncvSafeCall( nppiStTranspose_32u_C1R(const_cast(src.ptr()), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); } else // if (src.elemSize() == 8) @@ -104,7 +104,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiStTranspose_64u_C1R(const_cast(src.ptr()), static_cast(src.step), + ncvSafeCall( nppiStTranspose_64u_C1R(const_cast(src.ptr()), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); } diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 984b33f820..6bef7fbe68 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -66,10 +66,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1) { ncvSetDebugOutputHandler(NCVDebugOutputHandler); - if (ncvStat != load(filename)) - { - CV_Error(CV_GpuApiCallError, "Error in GPU cacade load"); - } + ncvSafeCall( load(filename) ); } @@ -287,11 +284,7 @@ int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMa } unsigned int numDetections; - NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections); - if (ncvStat != NCV_SUCCESS) - { - CV_Error(CV_GpuApiCallError, "Error in face detectioln"); - } + ncvSafeCall( impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections) ); return numDetections; } diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 4d54895edc..8149945647 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -120,8 +120,7 @@ namespace cv { namespace gpu { namespace imgproc return make_short2((short)x0, (short)y0); } - extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows, - int sp, int sr, int maxIter, float eps ) + __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps ) { int x0 = blockIdx.x * blockDim.x + threadIdx.x; int y0 = blockIdx.y * blockDim.y + threadIdx.y; @@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace imgproc do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps); } - extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep, - unsigned char* outsp, size_t outspstep, - int cols, int rows, - int sp, int sr, int maxIter, float eps ) + __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, + unsigned char* outsp, size_t outspstep, + int cols, int rows, + int sp, int sr, int maxIter, float eps) { int x0 = blockIdx.x * blockDim.x + threadIdx.x; int y0 = blockIdx.y * blockDim.y + threadIdx.y; @@ -145,7 +144,7 @@ namespace cv { namespace gpu { namespace imgproc } } - 
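Hedged usage sketch for the asynchronous overloads declared above: passing a cv::gpu::Stream makes the call enqueue its work instead of blocking, and the caller synchronizes explicitly when the result is needed. The helper name and parameter values are assumptions for illustration, not taken from the patch.

    #include "opencv2/gpu/gpu.hpp"

    // illustrative helper, not from the patch
    void enqueueMeanShift(const cv::gpu::GpuMat& src)
    {
        cv::gpu::Stream stream;
        cv::gpu::GpuMat filtered;

        cv::gpu::meanShiftFiltering(src, filtered, 20, 20,
            cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1),
            stream);                    // queued on 'stream', returns without blocking

        stream.waitForCompletion();     // synchronize only when the result is needed
    }
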
extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps) + void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream) { dim3 grid(1, 1, 1); dim3 threads(32, 8, 1); @@ -155,13 +154,16 @@ namespace cv { namespace gpu { namespace imgproc cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); + meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + + //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } - extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps) + + void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) { dim3 grid(1, 1, 1); dim3 threads(32, 8, 1); @@ -171,11 +173,13 @@ namespace cv { namespace gpu { namespace imgproc cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); + meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + + //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } /////////////////////////////////// drawColorDisp /////////////////////////////////////////////// @@ -389,15 +393,16 @@ namespace cv { namespace gpu { namespace imgproc } } - void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst) + void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y)); - extractCovData_kernel<<>>(Dx.cols, Dx.rows, Dx, Dy, dst); + extractCovData_kernel<<>>(Dx.cols, Dx.rows, Dx, Dy, dst); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// @@ -475,7 +480,7 @@ namespace cv { namespace gpu { namespace imgproc } void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, - int border_type) + int border_type, cudaStream_t stream) { const int rows = Dx.rows; const int cols = Dx.cols; @@ -492,7 +497,7 @@ namespace cv { namespace gpu { namespace imgproc switch (border_type) { case BORDER_REFLECT101_GPU: - cornerHarris_kernel<<>>( + cornerHarris_kernel<<>>( cols, rows, block_size, k, dst, BrdRowReflect101(cols), BrdColReflect101(rows)); break; case BORDER_REPLICATE_GPU: @@ -500,16 +505,18 @@ namespace cv { namespace gpu { namespace 
imgproc harrisDxTex.addressMode[1] = cudaAddressModeClamp; harrisDyTex.addressMode[0] = cudaAddressModeClamp; harrisDyTex.addressMode[1] = cudaAddressModeClamp; - cornerHarris_kernel<<>>(cols, rows, block_size, k, dst); + + cornerHarris_kernel<<>>(cols, rows, block_size, k, dst); break; } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall(cudaUnbindTexture(harrisDxTex)); - cudaSafeCall(cudaUnbindTexture(harrisDyTex)); + //cudaSafeCall(cudaUnbindTexture(harrisDxTex)); + //cudaSafeCall(cudaUnbindTexture(harrisDyTex)); } /////////////////////////////////////////// Corner Min Eigen Val ///////////////////////////////////////////////// @@ -592,7 +599,7 @@ namespace cv { namespace gpu { namespace imgproc } void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, - int border_type) + int border_type, cudaStream_t stream) { const int rows = Dx.rows; const int cols = Dx.cols; @@ -609,7 +616,7 @@ namespace cv { namespace gpu { namespace imgproc switch (border_type) { case BORDER_REFLECT101_GPU: - cornerMinEigenVal_kernel<<>>( + cornerMinEigenVal_kernel<<>>( cols, rows, block_size, dst, BrdRowReflect101(cols), BrdColReflect101(rows)); break; case BORDER_REPLICATE_GPU: @@ -617,16 +624,18 @@ namespace cv { namespace gpu { namespace imgproc minEigenValDxTex.addressMode[1] = cudaAddressModeClamp; minEigenValDyTex.addressMode[0] = cudaAddressModeClamp; minEigenValDyTex.addressMode[1] = cudaAddressModeClamp; - cornerMinEigenVal_kernel<<>>(cols, rows, block_size, dst); + + cornerMinEigenVal_kernel<<>>(cols, rows, block_size, dst); break; } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaDeviceSynchronize()); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); - cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); - cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); + //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); + //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); } ////////////////////////////// Column Sum ////////////////////////////////////// @@ -667,8 +676,7 @@ namespace cv { namespace gpu { namespace imgproc ////////////////////////////////////////////////////////////////////////// // mulSpectrums - __global__ void mulSpectrumsKernel(const PtrStep a, const PtrStep b, - DevMem2D_ c) + __global__ void mulSpectrumsKernel(const PtrStep a, const PtrStep b, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -680,25 +688,23 @@ namespace cv { namespace gpu { namespace imgproc } - void mulSpectrums(const PtrStep a, const PtrStep b, - DevMem2D_ c) + void mulSpectrums(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulSpectrumsKernel<<>>(a, b, c); + mulSpectrumsKernel<<>>(a, b, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// // mulSpectrums_CONJ - __global__ void mulSpectrumsKernel_CONJ( - const PtrStep a, const PtrStep b, - DevMem2D_ c) + __global__ void mulSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -710,25 +716,23 @@ namespace cv { namespace gpu { namespace 
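The recurring change in the imgproc.cu hunks above and below is one launcher pattern: accept a cudaStream_t, launch on it, and synchronize only when the default stream (0) is used. A minimal sketch of that pattern follows; the kernel and caller names are illustrative, and it assumes the gpu module's internal helpers (divUp, cudaSafeCall, DevMem2Df) that the surrounding files already use.

    __global__ void scaleKernel(DevMem2Df src, PtrStepf dst, float factor)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < src.cols && y < src.rows)
            dst.ptr(y)[x] = src.ptr(y)[x] * factor;
    }

    void scaleCaller(const DevMem2Df& src, PtrStepf dst, float factor, cudaStream_t stream)
    {
        const dim3 threads(32, 8);
        const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y));

        // Work is queued on the caller-supplied stream; 0 means the default stream.
        scaleKernel<<<grid, threads, 0, stream>>>(src, dst, factor);
        cudaSafeCall( cudaGetLastError() );

        // Block only for the default stream; asynchronous callers synchronize later
        // through the cv::gpu::Stream that owns this cudaStream_t.
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
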
imgproc } - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, - DevMem2D_ c) + void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulSpectrumsKernel_CONJ<<>>(a, b, c); + mulSpectrumsKernel_CONJ<<>>(a, b, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// // mulAndScaleSpectrums - __global__ void mulAndScaleSpectrumsKernel( - const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + __global__ void mulAndScaleSpectrumsKernel(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -741,25 +745,23 @@ namespace cv { namespace gpu { namespace imgproc } - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulAndScaleSpectrumsKernel<<>>(a, b, scale, c); + mulAndScaleSpectrumsKernel<<>>(a, b, scale, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// // mulAndScaleSpectrums_CONJ - __global__ void mulAndScaleSpectrumsKernel_CONJ( - const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -772,16 +774,16 @@ namespace cv { namespace gpu { namespace imgproc } - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulAndScaleSpectrumsKernel_CONJ<<>>(a, b, scale, c); + mulAndScaleSpectrumsKernel_CONJ<<>>(a, b, scale, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// @@ -1015,17 +1017,18 @@ namespace cv { namespace gpu { namespace imgproc } } - void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel) + void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream) { cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); const dim3 block(16, 16); const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); - convolve<<>>(src, dst, kWidth, kHeight); + convolve<<>>(src, dst, kWidth, kHeight); cudaSafeCall(cudaGetLastError()); - cudaSafeCall(cudaDeviceSynchronize()); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); } diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index 242cf3c762..e954a26635 100644 --- a/modules/gpu/src/cuda/match_template.cu 
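The match_template.cu rewrite that follows replaces the per-channel switch (cn) blocks with templated launchers selected through small function-pointer tables, with the same stream handling as above. A hedged sketch of that dispatch shape, using illustrative names rather than the patch's own:

    template <typename T, int cn>
    void matchWorker(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
    {
        // templated kernel launch for T with cn channels, as in the launchers below ...
    }

    void matchDispatch(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2Db, const DevMem2Db, DevMem2Df, cudaStream_t);

        // Index 0 is unused so the table can be indexed directly by the channel count (1..4).
        static const caller_t callers[] =
        {
            0, matchWorker<unsigned char, 1>, matchWorker<unsigned char, 2>,
               matchWorker<unsigned char, 3>, matchWorker<unsigned char, 4>
        };

        callers[cn](image, templ, result, stream);
    }
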
+++ b/modules/gpu/src/cuda/match_template.cu @@ -78,11 +78,11 @@ __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +////////////////////////////////////////////////////////////////////// +// Naive_CCORR -template -__global__ void matchTemplateNaiveKernel_CCORR( - int w, int h, const PtrStepb image, const PtrStepb templ, - DevMem2Df result) +template +__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result) { typedef typename TypeVec::vec_type Type; typedef typename TypeVec::vec_type Typef; @@ -106,73 +106,49 @@ __global__ void matchTemplateNaiveKernel_CCORR( } } +template +void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream) +{ + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); -void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) + matchTemplateNaiveKernel_CCORR<<>>(templ.cols, templ.rows, image, templ, result); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + +void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); - switch (cn) + static const caller_t callers[] = { - case 1: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](image, templ, result, stream); } -void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) +void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); - switch (cn) + static const caller_t callers[] = { - case 1: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR, 
matchTemplateNaive_CCORR, matchTemplateNaive_CCORR + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](image, templ, result, stream); } +////////////////////////////////////////////////////////////////////// +// Naive_SQDIFF template -__global__ void matchTemplateNaiveKernel_SQDIFF( - int w, int h, const PtrStepb image, const PtrStepb templ, - DevMem2Df result) +__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result) { typedef typename TypeVec::vec_type Type; typedef typename TypeVec::vec_type Typef; @@ -200,73 +176,48 @@ __global__ void matchTemplateNaiveKernel_SQDIFF( } } - -void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) +template +void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - switch (cn) - { - case 1: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } + matchTemplateNaiveKernel_SQDIFF<<>>(templ.cols, templ.rows, image, templ, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } +void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) +{ + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); + + static const caller_t callers[] = + { + 0, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF + }; + + callers[cn](image, templ, result, stream); +} -void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) +void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); - switch (cn) + static const caller_t callers[] = { - case 1: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](image, templ, result, stream); } +////////////////////////////////////////////////////////////////////// +// Prepared_SQDIFF template -__global__ void 
matchTemplatePreparedKernel_SQDIFF_8U( - int w, int h, const PtrStep image_sqsum, - unsigned int templ_sqsum, DevMem2Df result) +__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep image_sqsum, unsigned int templ_sqsum, DevMem2Df result) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -281,37 +232,34 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_8U( } } +template +void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream) +{ + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + + matchTemplatePreparedKernel_SQDIFF_8U<<>>(w, h, image_sqsum, templ_sqsum, result); + cudaSafeCall( cudaGetLastError() ); -void matchTemplatePrepared_SQDIFF_8U( - int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn) + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + +void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, + cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - switch (cn) + typedef void (*caller_t)(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream); + + static const caller_t callers[] = { - case 1: - matchTemplatePreparedKernel_SQDIFF_8U<1><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 2: - matchTemplatePreparedKernel_SQDIFF_8U<2><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 3: - matchTemplatePreparedKernel_SQDIFF_8U<3><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 4: - matchTemplatePreparedKernel_SQDIFF_8U<4><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4> + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](w, h, image_sqsum, templ_sqsum, result, stream); } +////////////////////////////////////////////////////////////////////// +// Prepared_SQDIFF_NORMED // normAcc* are accurate normalization routines which make GPU matchTemplate // consistent with CPU one @@ -337,9 +285,7 @@ __device__ float normAcc_SQDIFF(float num, float denum) template -__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U( - int w, int h, const PtrStep image_sqsum, - unsigned int templ_sqsum, DevMem2Df result) +__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep image_sqsum, unsigned int templ_sqsum, DevMem2Df result) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -355,41 +301,37 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U( } } +template +void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, + DevMem2Df result, cudaStream_t stream) +{ + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + + matchTemplatePreparedKernel_SQDIFF_NORMED_8U<<>>(w, h, image_sqsum, templ_sqsum, result); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + -void matchTemplatePrepared_SQDIFF_NORMED_8U( - 
int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn) +void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, + DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - switch (cn) + typedef void (*caller_t)(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream); + static const caller_t callers[] = { - case 1: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<1><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 2: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<2><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 3: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<3><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 4: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<4><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4> + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](w, h, image_sqsum, templ_sqsum, result, stream); } +////////////////////////////////////////////////////////////////////// +// Prepared_CCOFF -__global__ void matchTemplatePreparedKernel_CCOFF_8U( - int w, int h, float templ_sum_scale, - const PtrStep image_sum, DevMem2Df result) +__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep image_sum, DevMem2Df result) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -404,21 +346,20 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U( } } - -void matchTemplatePrepared_CCOFF_8U( - int w, int h, const DevMem2D_ image_sum, - unsigned int templ_sum, DevMem2Df result) +void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_ image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8U<<>>( - w, h, (float)templ_sum / (w * h), image_sum, result); + + matchTemplatePreparedKernel_CCOFF_8U<<>>(w, h, (float)templ_sum / (w * h), image_sum, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_8UC2( int w, int h, float templ_sum_scale_r, float templ_sum_scale_g, const PtrStep image_sum_r, @@ -442,25 +383,27 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC2( } } - void matchTemplatePrepared_CCOFF_8UC2( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sum_g, unsigned int templ_sum_r, unsigned int templ_sum_g, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8UC2<<>>( + + matchTemplatePreparedKernel_CCOFF_8UC2<<>>( w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h), image_sum_r, image_sum_g, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_8UC3( 
int w, int h, float templ_sum_scale_r, @@ -492,7 +435,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3( } } - void matchTemplatePrepared_CCOFF_8UC3( int w, int h, const DevMem2D_ image_sum_r, @@ -501,11 +443,12 @@ void matchTemplatePrepared_CCOFF_8UC3( unsigned int templ_sum_r, unsigned int templ_sum_g, unsigned int templ_sum_b, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8UC3<<>>( + + matchTemplatePreparedKernel_CCOFF_8UC3<<>>( w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h), @@ -513,10 +456,12 @@ void matchTemplatePrepared_CCOFF_8UC3( image_sum_r, image_sum_g, image_sum_b, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_8UC4( int w, int h, float templ_sum_scale_r, @@ -554,7 +499,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC4( } } - void matchTemplatePrepared_CCOFF_8UC4( int w, int h, const DevMem2D_ image_sum_r, @@ -565,11 +509,12 @@ void matchTemplatePrepared_CCOFF_8UC4( unsigned int templ_sum_g, unsigned int templ_sum_b, unsigned int templ_sum_a, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8UC4<<>>( + + matchTemplatePreparedKernel_CCOFF_8UC4<<>>( w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h), @@ -579,9 +524,12 @@ void matchTemplatePrepared_CCOFF_8UC4( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } +////////////////////////////////////////////////////////////////////// +// Prepared_CCOFF_NORMED __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U( int w, int h, float weight, @@ -607,12 +555,11 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U( } } - void matchTemplatePrepared_CCOFF_NORMED_8U( int w, int h, const DevMem2D_ image_sum, const DevMem2D_ image_sqsum, unsigned int templ_sum, unsigned int templ_sqsum, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -620,15 +567,18 @@ void matchTemplatePrepared_CCOFF_NORMED_8U( float weight = 1.f / (w * h); float templ_sum_scale = templ_sum * weight; float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum; - matchTemplatePreparedKernel_CCOFF_NORMED_8U<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8U<<>>( w, h, weight, templ_sum_scale, templ_sqsum_scale, image_sum, image_sqsum, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2( int w, int h, float weight, float templ_sum_scale_r, float templ_sum_scale_g, @@ -663,14 +613,13 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2( } } - void matchTemplatePrepared_CCOFF_NORMED_8UC2( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, - DevMem2Df result) + DevMem2Df result, cudaStream_t 
stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -680,7 +629,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2( float templ_sum_scale_g = templ_sum_g * weight; float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r + templ_sqsum_g - weight * templ_sum_g * templ_sum_g; - matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<>>( w, h, weight, templ_sum_scale_r, templ_sum_scale_g, templ_sqsum_scale, @@ -689,10 +639,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3( int w, int h, float weight, float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, @@ -736,7 +688,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3( } } - void matchTemplatePrepared_CCOFF_NORMED_8UC3( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -745,7 +696,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3( unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -757,7 +708,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3( float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r + templ_sqsum_g - weight * templ_sum_g * templ_sum_g + templ_sqsum_b - weight * templ_sum_b * templ_sum_b; - matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<>>( w, h, weight, templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sqsum_scale, @@ -767,10 +719,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4( int w, int h, float weight, float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, @@ -821,7 +775,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4( } } - void matchTemplatePrepared_CCOFF_NORMED_8UC4( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -832,7 +785,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4( unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, unsigned int templ_sum_a, unsigned int templ_sqsum_a, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -846,7 +799,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4( + templ_sqsum_g - weight * templ_sum_g * templ_sum_g + templ_sqsum_b - weight * templ_sum_b * templ_sum_b + templ_sqsum_a - weight * templ_sum_a * templ_sum_a; - matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<>>( w, h, weight, templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a, templ_sqsum_scale, @@ -857,9 +811,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 
0) + cudaSafeCall( cudaDeviceSynchronize() ); } +////////////////////////////////////////////////////////////////////// +// normalize template __global__ void normalizeKernel_8U( @@ -878,32 +835,36 @@ __global__ void normalizeKernel_8U( } } - void normalize_8U(int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn) + unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + switch (cn) { case 1: - normalizeKernel_8U<1><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<1><<>>(w, h, image_sqsum, templ_sqsum, result); break; case 2: - normalizeKernel_8U<2><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<2><<>>(w, h, image_sqsum, templ_sqsum, result); break; case 3: - normalizeKernel_8U<3><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<3><<>>(w, h, image_sqsum, templ_sqsum, result); break; case 4: - normalizeKernel_8U<4><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<4><<>>(w, h, image_sqsum, templ_sqsum, result); break; } + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } +////////////////////////////////////////////////////////////////////// +// extractFirstChannel template __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result) @@ -920,8 +881,7 @@ __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result) } } - -void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn) +void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -929,23 +889,21 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn) switch (cn) { case 1: - extractFirstChannel_32F<1><<>>(image, result); + extractFirstChannel_32F<1><<>>(image, result); break; case 2: - extractFirstChannel_32F<2><<>>(image, result); + extractFirstChannel_32F<2><<>>(image, result); break; case 3: - extractFirstChannel_32F<3><<>>(image, result); + extractFirstChannel_32F<3><<>>(image, result); break; case 4: - extractFirstChannel_32F<4><<>>(image, result); + extractFirstChannel_32F<4><<>>(image, result); break; } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - - }}} - diff --git a/modules/gpu/src/cuda/safe_call.hpp b/modules/gpu/src/cuda/safe_call.hpp index e3e00beaaf..6e0c219e19 100644 --- a/modules/gpu/src/cuda/safe_call.hpp +++ b/modules/gpu/src/cuda/safe_call.hpp @@ -45,16 +45,18 @@ #include "cuda_runtime_api.h" #include "cufft.h" -//#include +#include "NCV.hpp" #if defined(__GNUC__) #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) - #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__) #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__) + #define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__, __func__) + #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__) #else /* defined(__CUDACC__) || defined(__MSVC__) */ #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) - #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__) #define nppSafeCall(expr) 
___nppSafeCall(expr, __FILE__, __LINE__) + #define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__) + #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__) #endif namespace cv @@ -62,8 +64,9 @@ namespace cv namespace gpu { void error(const char *error_string, const char *file, const int line, const char *func = ""); - void nppError(int err, const char *file, const int line, const char *func = ""); - void cufftError(int err, const char *file, const int line, const char *func = ""); + void nppError(int err, const char *file, const int line, const char *func = ""); + void ncvError(int err, const char *file, const int line, const char *func = ""); + void cufftError(int err, const char *file, const int line, const char *func = ""); static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") { @@ -71,17 +74,23 @@ namespace cv cv::gpu::error(cudaGetErrorString(err), file, line, func); } - static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "") - { - if (CUFFT_SUCCESS != err) - cv::gpu::cufftError(err, file, line, func); - } - static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") { if (err < 0) cv::gpu::nppError(err, file, line, func); } + + static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "") + { + if (NCV_SUCCESS != err) + cv::gpu::ncvError(err, file, line, func); + } + + static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "") + { + if (CUFFT_SUCCESS != err) + cv::gpu::cufftError(err, file, line, func); + } } } diff --git a/modules/gpu/src/error.cpp b/modules/gpu/src/error.cpp index 1f38f6174c..e5090dbc3b 100644 --- a/modules/gpu/src/error.cpp +++ b/modules/gpu/src/error.cpp @@ -42,30 +42,45 @@ #include "precomp.hpp" - using namespace cv; using namespace cv::gpu; +using namespace std; - -#if !defined (HAVE_CUDA) - -#else /* !defined (HAVE_CUDA) */ - +#ifdef HAVE_CUDA namespace { #define error_entry(entry) { entry, #entry } - ////////////////////////////////////////////////////////////////////////// - // NPP errors - - struct NppError + struct ErrorEntry { - int error; + int code; string str; - } + }; + + struct ErrorEntryComparer + { + int code; + ErrorEntryComparer(int code_) : code(code_) {}; + bool operator()(const ErrorEntry& e) const { return e.code == code; } + }; + + string getErrorString(int code, const ErrorEntry* errors, size_t n) + { + size_t idx = find_if(errors, errors + n, ErrorEntryComparer(code)) - errors; + + const string& msg = (idx != n) ? 
errors[idx].str : string("Unknown error code"); + + ostringstream ostr; + ostr << msg << " [Code = " << code << "]"; + + return ostr.str(); + } + + ////////////////////////////////////////////////////////////////////////// + // NPP errors - npp_errors [] = + const ErrorEntry npp_errors [] = { error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ), error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ), @@ -74,6 +89,7 @@ namespace #if defined (_MSC_VER) error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ), #endif + error_entry( NPP_BAD_ARG_ERROR ), error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ), error_entry( NPP_TEXTURE_BIND_ERROR ), @@ -110,106 +126,116 @@ namespace error_entry( NPP_ODD_ROI_WARNING ) }; - const size_t error_num = sizeof(npp_errors) / sizeof(npp_errors[0]); + const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]); - struct Searcher + ////////////////////////////////////////////////////////////////////////// + // NCV errors + + const ErrorEntry ncv_errors [] = { - int err; - Searcher(int err_) : err(err_) {}; - bool operator()(const NppError& e) const { return e.error == err; } + error_entry( NCV_SUCCESS ), + error_entry( NCV_UNKNOWN_ERROR ), + error_entry( NCV_CUDA_ERROR ), + error_entry( NCV_NPP_ERROR ), + error_entry( NCV_FILE_ERROR ), + error_entry( NCV_NULL_PTR ), + error_entry( NCV_INCONSISTENT_INPUT ), + error_entry( NCV_TEXTURE_BIND_ERROR ), + error_entry( NCV_DIMENSIONS_INVALID ), + error_entry( NCV_INVALID_ROI ), + error_entry( NCV_INVALID_STEP ), + error_entry( NCV_INVALID_SCALE ), + error_entry( NCV_INVALID_SCALE ), + error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ), + error_entry( NCV_ALLOCATOR_BAD_ALLOC ), + error_entry( NCV_ALLOCATOR_BAD_DEALLOC ), + error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ), + error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ), + error_entry( NCV_ALLOCATOR_BAD_REUSE ), + error_entry( NCV_MEM_COPY_ERROR ), + error_entry( NCV_MEM_RESIDENCE_ERROR ), + error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ), + error_entry( NCV_HAAR_INVALID_PIXEL_STEP ), + error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ), + error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ), + error_entry( NCV_HAAR_TOO_LARGE_FEATURES ), + error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ), + error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ), + error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ), + error_entry( NPPST_SUCCESS ), + error_entry( NPPST_ERROR ), + error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ), + error_entry( NPPST_NULL_POINTER_ERROR ), + error_entry( NPPST_TEXTURE_BIND_ERROR ), + error_entry( NPPST_MEMCPY_ERROR ), + error_entry( NPPST_MEM_ALLOC_ERR ), + error_entry( NPPST_MEMFREE_ERR ), + error_entry( NPPST_INVALID_ROI ), + error_entry( NPPST_INVALID_STEP ), + error_entry( NPPST_INVALID_SCALE ), + error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ), + error_entry( NPPST_MEM_RESIDENCE_ERROR ), + error_entry( NPPST_MEM_INTERNAL_ERROR ) }; + const size_t ncv_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]); + ////////////////////////////////////////////////////////////////////////// // CUFFT errors - struct CufftError - { - int code; - string message; - }; - - const CufftError cufft_errors[] = - { - error_entry(CUFFT_INVALID_PLAN), - error_entry(CUFFT_ALLOC_FAILED), - error_entry(CUFFT_INVALID_TYPE), - error_entry(CUFFT_INVALID_VALUE), - error_entry(CUFFT_INTERNAL_ERROR), - error_entry(CUFFT_EXEC_FAILED), - error_entry(CUFFT_SETUP_FAILED), - error_entry(CUFFT_INVALID_SIZE), - error_entry(CUFFT_UNALIGNED_DATA) - }; - - struct CufftErrorComparer + const ErrorEntry cufft_errors[] 
= { - CufftErrorComparer(int code_): code(code_) {} - bool operator()(const CufftError& other) const - { - return other.code == code; - } - int code; + error_entry( CUFFT_INVALID_PLAN ), + error_entry( CUFFT_ALLOC_FAILED ), + error_entry( CUFFT_INVALID_TYPE ), + error_entry( CUFFT_INVALID_VALUE ), + error_entry( CUFFT_INTERNAL_ERROR ), + error_entry( CUFFT_EXEC_FAILED ), + error_entry( CUFFT_SETUP_FAILED ), + error_entry( CUFFT_INVALID_SIZE ), + error_entry( CUFFT_UNALIGNED_DATA ) }; const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]); - } namespace cv { namespace gpu { - const string getNppErrorString( int err ) - { - size_t idx = std::find_if(npp_errors, npp_errors + error_num, Searcher(err)) - npp_errors; - const string& msg = (idx != error_num) ? npp_errors[idx].str : string("Unknown error code"); - - std::stringstream interpreter; - interpreter << msg <<" [Code = " << err << "]"; - - return interpreter.str(); - } - - void nppError( int err, const char *file, const int line, const char *func) - { - cv::error( cv::Exception(CV_GpuNppCallError, getNppErrorString(err), func, file, line) ); - } - - const string getCufftErrorString(int err_code) - { - const CufftError* cufft_error = std::find_if( - cufft_errors, cufft_errors + cufft_error_num, - CufftErrorComparer(err_code)); - - bool found = cufft_error != cufft_errors + cufft_error_num; - - std::stringstream ss; - ss << (found ? cufft_error->message : "Unknown error code"); - ss << " [Code = " << err_code << "]"; - - return ss.str(); - } - - void cufftError(int err, const char *file, const int line, const char *func) - { - cv::error(cv::Exception(CV_GpuCufftCallError, getCufftErrorString(err), func, file, line)); - } - void error(const char *error_string, const char *file, const int line, const char *func) { int code = CV_GpuApiCallError; - if (std::uncaught_exception()) + if (uncaught_exception()) { const char* errorStr = cvErrorStr(code); const char* function = func ? 
func : "unknown function"; - std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line; - std::cerr.flush(); + cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line; + cerr.flush(); } else cv::error( cv::Exception(code, error_string, func, file, line) ); } + + void nppError(int code, const char *file, const int line, const char *func) + { + string msg = getErrorString(code, npp_errors, npp_error_num); + cv::gpu::error(msg.c_str(), file, line, func); + } + + void ncvError(int code, const char *file, const int line, const char *func) + { + string msg = getErrorString(code, ncv_errors, ncv_error_num); + cv::gpu::error(msg.c_str(), file, line, func); + } + + void cufftError(int code, const char *file, const int line, const char *func) + { + string msg = getErrorString(code, cufft_errors, cufft_error_num); + cv::gpu::error(msg.c_str(), file, line, func); + } } } diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 0aa88e81b5..528ac65794 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -48,8 +48,8 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, const Scalar&, Stream&){ throw_nogpu(); } -void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); } -void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); } +void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); } +void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); } void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); } void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); } @@ -82,14 +82,16 @@ void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nog void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } +void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_nogpu(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); } -void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); } -void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool) { throw_nogpu(); } -void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); } +void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_nogpu(); } +void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_nogpu(); } +void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); } void 
cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); } void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); } +void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream) { throw_nogpu(); } void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); } @@ -151,10 +153,10 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp namespace cv { namespace gpu { namespace imgproc { - extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps); + void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream); }}} -void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria) +void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream) { if( src.empty() ) CV_Error( CV_StsBadArg, "The input image is empty" ); @@ -174,7 +176,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, eps = 1.f; eps = (float)std::max(criteria.epsilon, 0.0); - imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps); + imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -182,10 +184,10 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, namespace cv { namespace gpu { namespace imgproc { - extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps); + void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream); }}} -void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria) +void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream) { if( src.empty() ) CV_Error( CV_StsBadArg, "The input image is empty" ); @@ -206,7 +208,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int eps = 1.f; eps = (float)std::max(criteria.epsilon, 0.0); - imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps); + imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -766,14 +768,14 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) ); Ncv32u bufSize; - nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); + ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer); cudaStream_t stream = StreamAccessor::getStream(s); NppStStreamHandler h(stream); - nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), static_cast(src.step), + ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), static_cast(src.step), sum.ptr(), static_cast(sum.step), 
roiSize, buffer.ptr(), bufSize, prop) ); if (stream == 0) @@ -819,7 +821,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s) cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) ); Ncv32u bufSize; - nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop)); + ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop)); GpuMat buf(1, bufSize, CV_8U); cudaStream_t stream = StreamAccessor::getStream(s); @@ -827,7 +829,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s) NppStStreamHandler h(stream); sqsum.create(src.rows + 1, src.cols + 1, CV_64F); - nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast(src.ptr(0)), static_cast(src.step), + ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast(src.ptr(0)), static_cast(src.step), sqsum.ptr(0), static_cast(sqsum.step), roiSize, buf.ptr(0), bufSize, prop)); if (stream == 0) @@ -1260,16 +1262,16 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& namespace cv { namespace gpu { namespace imgproc { - void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst); - void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type); - void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type); + void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream); + void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); + void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); }}} namespace { template - void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType) + void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) { double scale = (double)(1 << ((ksize > 0 ? 
ksize : 3) - 1)) * blockSize; if (ksize < 0) @@ -1283,25 +1285,25 @@ namespace if (ksize > 0) { - Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, borderType); - Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, borderType); + Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream); + Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream); } else { - Scharr(src, Dx, CV_32F, 1, 0, scale, borderType); - Scharr(src, Dy, CV_32F, 0, 1, scale, borderType); + Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream); + Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream); } } - void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType) + void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) { switch (src.type()) { case CV_8U: - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); break; case CV_32F: - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); break; default: CV_Error(CV_StsBadArg, "extractCovData: unsupported type of the source matrix"); @@ -1343,6 +1345,12 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ks } void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType) +{ + GpuMat buf; + cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType); +} + +void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream) { CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE); @@ -1350,9 +1358,9 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); dst.create(src.size(), CV_32F); - imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType); + imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); } void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType) @@ -1362,6 +1370,12 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, i } void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType) +{ + GpuMat buf; + cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType); +} + +void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) { CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE); @@ -1369,9 +1383,9 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); dst.create(src.size(), CV_32F); - imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType); + 
imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// @@ -1379,21 +1393,16 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM namespace cv { namespace gpu { namespace imgproc { - void mulSpectrums(const PtrStep a, const PtrStep b, - DevMem2D_ c); + void mulSpectrums(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream); - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, - DevMem2D_ c); + void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream); }}} -void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, - int flags, bool conjB) +void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) { - typedef void (*Caller)(const PtrStep, const PtrStep, - DevMem2D_); - static Caller callers[] = { imgproc::mulSpectrums, - imgproc::mulSpectrums_CONJ }; + typedef void (*Caller)(const PtrStep, const PtrStep, DevMem2D_, cudaStream_t stream); + static Caller callers[] = { imgproc::mulSpectrums, imgproc::mulSpectrums_CONJ }; CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); CV_Assert(a.size() == b.size()); @@ -1401,7 +1410,7 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, c.create(a.size(), CV_32FC2); Caller caller = callers[(int)conjB]; - caller(a, b, c); + caller(a, b, c, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// @@ -1409,21 +1418,16 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, namespace cv { namespace gpu { namespace imgproc { - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c); + void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream); - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c); + void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream); }}} -void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, - int flags, float scale, bool conjB) +void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) { - typedef void (*Caller)(const PtrStep, const PtrStep, - float scale, DevMem2D_); - static Caller callers[] = { imgproc::mulAndScaleSpectrums, - imgproc::mulAndScaleSpectrums_CONJ }; + typedef void (*Caller)(const PtrStep, const PtrStep, float scale, DevMem2D_, cudaStream_t stream); + static Caller callers[] = { imgproc::mulAndScaleSpectrums, imgproc::mulAndScaleSpectrums_CONJ }; CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); CV_Assert(a.size() == b.size()); @@ -1431,14 +1435,26 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, c.create(a.size(), CV_32FC2); Caller caller = callers[(int)conjB]; - caller(a, b, scale, c); + caller(a, b, scale, c, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// // dft -void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags) +void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream) { +#ifndef HAVE_CUFFT + + OPENCV_GPU_UNUSED(src); + OPENCV_GPU_UNUSED(dst); + OPENCV_GPU_UNUSED(dft_size); + 
OPENCV_GPU_UNUSED(flags); + OPENCV_GPU_UNUSED(stream); + + throw_nogpu(); + +#else + CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2); // We don't support unpacked output (in the case of real input) @@ -1483,6 +1499,8 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags) else cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type); + cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); + if (is_complex_input) { if (is_complex_output) @@ -1514,7 +1532,9 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags) cufftSafeCall(cufftDestroy(plan)); if (is_scaled_dft) - multiply(dst, Scalar::all(1. / dft_size.area()), dst); + multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream); + +#endif } ////////////////////////////////////////////////////////////////////////////// @@ -1563,8 +1583,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size templ_size) } -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr) +void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr) { ConvolveBuf buf; convolve(image, templ, result, ccorr, buf); @@ -1572,12 +1591,37 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, namespace cv { namespace gpu { namespace imgproc { - void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel); + void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream); }}} -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr, ConvolveBuf& buf) +void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream) { +#ifndef HAVE_CUFFT + + CV_Assert(image.type() == CV_32F); + CV_Assert(templ.type() == CV_32F); + CV_Assert(templ.cols <= 17 && templ.rows <= 17); + + result.create(image.size(), CV_32F); + + GpuMat& contKernel = buf.templ_block; + + if (templ.isContinuous()) + contKernel = templ; + else + { + contKernel = createContinuous(templ.size(), templ.type()); + + if (stream) + stream.enqueueCopy(templ, contKernel); + else + templ.copyTo(contKernel); + } + + imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr(), StreamAccessor::getStream(stream)); + +#else + StaticAssert::check(); StaticAssert::check(); @@ -1587,77 +1631,91 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, if (templ.cols < 13 && templ.rows < 13) { result.create(image.size(), CV_32F); - GpuMat contKernel; + + GpuMat& contKernel = buf.templ_block; if (templ.isContinuous()) contKernel = templ; else { contKernel = createContinuous(templ.size(), templ.type()); - templ.copyTo(contKernel); - } - imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr()); + if (stream) + stream.enqueueCopy(templ, contKernel); + else + templ.copyTo(contKernel); + } - return; + imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr(), StreamAccessor::getStream(stream)); } + else + { + buf.create(image.size(), templ.size()); + result.create(buf.result_size, CV_32F); - buf.create(image.size(), templ.size()); - result.create(buf.result_size, CV_32F); + Size& block_size = buf.block_size; + Size& dft_size = buf.dft_size; - Size& block_size = buf.block_size; - Size& dft_size = buf.dft_size; + GpuMat& image_block = buf.image_block; + 
GpuMat& templ_block = buf.templ_block; + GpuMat& result_data = buf.result_data; - GpuMat& image_block = buf.image_block; - GpuMat& templ_block = buf.templ_block; - GpuMat& result_data = buf.result_data; + GpuMat& image_spect = buf.image_spect; + GpuMat& templ_spect = buf.templ_spect; + GpuMat& result_spect = buf.result_spect; - GpuMat& image_spect = buf.image_spect; - GpuMat& templ_spect = buf.templ_spect; - GpuMat& result_spect = buf.result_spect; + cufftHandle planR2C, planC2R; + cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); + cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); - cufftHandle planR2C, planC2R; - cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); - cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); + cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) ); + cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) ); - GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); - copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, - templ_block.cols - templ_roi.cols, 0); + GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); + copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + templ_block.cols - templ_roi.cols, 0, Scalar(), stream); - cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), - templ_spect.ptr())); + cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), + templ_spect.ptr())); - // Process all blocks of the result matrix - for (int y = 0; y < result.rows; y += block_size.height) - { - for (int x = 0; x < result.cols; x += block_size.width) + // Process all blocks of the result matrix + for (int y = 0; y < result.rows; y += block_size.height) { - Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, - std::min(y + dft_size.height, image.rows) - y); - GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), - image.step); - copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, - 0, image_block.cols - image_roi.cols, 0); - - cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), - image_spect.ptr())); - mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, - 1.f / dft_size.area(), ccorr); - cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), - result_data.ptr())); - - Size result_roi_size(std::min(x + block_size.width, result.cols) - x, - std::min(y + block_size.height, result.rows) - y); - GpuMat result_roi(result_roi_size, result.type(), - (void*)(result.ptr(y) + x), result.step); - GpuMat result_block(result_roi_size, result_data.type(), - result_data.ptr(), result_data.step); - result_block.copyTo(result_roi); + for (int x = 0; x < result.cols; x += block_size.width) + { + Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, + std::min(y + dft_size.height, image.rows) - y); + GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), + image.step); + copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, + 0, image_block.cols - image_roi.cols, 0, Scalar(), stream); + + cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), + image_spect.ptr())); + mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, + 1.f / dft_size.area(), ccorr, stream); + cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), + result_data.ptr())); + + Size result_roi_size(std::min(x + block_size.width, result.cols) - x, + std::min(y + 
block_size.height, result.rows) - y); + GpuMat result_roi(result_roi_size, result.type(), + (void*)(result.ptr(y) + x), result.step); + GpuMat result_block(result_roi_size, result_data.type(), + result_data.ptr(), result_data.step); + + if (stream) + stream.enqueueCopy(result_block, result_roi); + else + result_block.copyTo(result_roi); + } } + + cufftSafeCall(cufftDestroy(planR2C)); + cufftSafeCall(cufftDestroy(planC2R)); } - cufftSafeCall(cufftDestroy(planR2C)); - cufftSafeCall(cufftDestroy(planC2R)); +#endif } ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp index 58bc8bb911..e74d0fdc55 100644 --- a/modules/gpu/src/match_template.cpp +++ b/modules/gpu/src/match_template.cpp @@ -47,43 +47,32 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) -void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); } +void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } #else namespace cv { namespace gpu { namespace imgproc { - void matchTemplateNaive_CCORR_8U( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); + void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); + void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); - void matchTemplateNaive_CCORR_32F( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); + void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); + void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); - void matchTemplateNaive_SQDIFF_8U( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); + void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, + int cn, cudaStream_t stream); - void matchTemplateNaive_SQDIFF_32F( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); - - void matchTemplatePrepared_SQDIFF_8U( - int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn); - - void matchTemplatePrepared_SQDIFF_NORMED_8U( - int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn); - - void matchTemplatePrepared_CCOFF_8U( - int w, int h, const DevMem2D_ image_sum, - unsigned int templ_sum, DevMem2Df result); + void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, + int cn, cudaStream_t stream); + void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_ image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_8UC2( - int w, int h, - const DevMem2D_ image_sum_r, - const DevMem2D_ image_sum_g, - unsigned int templ_sum_r, unsigned int templ_sum_g, - DevMem2Df result); - + int w, int h, + const DevMem2D_ image_sum_r, + const DevMem2D_ image_sum_g, + unsigned int templ_sum_r, + unsigned int templ_sum_g, + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_8UC3( int w, int h, const DevMem2D_ image_sum_r, @@ -92,8 +81,7 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_r, unsigned int templ_sum_g, unsigned 
int templ_sum_b, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_8UC4( int w, int h, const DevMem2D_ image_sum_r, @@ -104,22 +92,21 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_g, unsigned int templ_sum_b, unsigned int templ_sum_a, - DevMem2Df result); + DevMem2Df result, cudaStream_t stream); + void matchTemplatePrepared_CCOFF_NORMED_8U( int w, int h, const DevMem2D_ image_sum, const DevMem2D_ image_sqsum, unsigned int templ_sum, unsigned int templ_sqsum, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC2( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC3( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -128,8 +115,7 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC4( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -140,12 +126,12 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, unsigned int templ_sum_a, unsigned int templ_sqsum_a, - DevMem2Df result); + DevMem2Df result, cudaStream_t stream); void normalize_8U(int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn); + unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream); - void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn); + void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream); }}} @@ -186,103 +172,111 @@ namespace } - void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F)) { - imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels()); + imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); return; } GpuMat result_; - convolve(image.reshape(1), templ.reshape(1), result_, true); - imgproc::extractFirstChannel_32F(result_, result, image.channels()); + ConvolveBuf buf; + convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream); + imgproc::extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U)) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels()); + 
imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); return; } GpuMat imagef, templf; - image.convertTo(imagef, CV_32F); - templ.convertTo(templf, CV_32F); - matchTemplate_CCORR_32F(imagef, templf, result); + if (stream) + { + stream.enqueueConvert(image, imagef, CV_32F); + stream.enqueueConvert(templ, templf, CV_32F); + } + else + { + image.convertTo(imagef, CV_32F); + templ.convertTo(templf, CV_32F); + } + matchTemplate_CCORR_32F(imagef, templf, result, stream); } - void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, - GpuMat& result) + void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum, stream); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, - result, image.channels()); + result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels()); + imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U)) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels()); + imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); return; } GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum, stream); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); imgproc::matchTemplatePrepared_SQDIFF_8U( - templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels()); + templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum, stream); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); imgproc::matchTemplatePrepared_SQDIFF_NORMED_8U( - templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels()); + templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& 
templ, GpuMat& result) + void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); if (image.channels() == 1) { GpuMat image_sum; - integral(image, image_sum); + integral(image, image_sum, stream); unsigned int templ_sum = (unsigned int)sum(templ)[0]; imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, - image_sum, templ_sum, result); + image_sum, templ_sum, result, StreamAccessor::getStream(stream)); } else { @@ -291,7 +285,7 @@ namespace split(image, images); for (int i = 0; i < image.channels(); ++i) - integral(images[i], image_sums[i]); + integral(images[i], image_sums[i], stream); Scalar templ_sum = sum(templ); @@ -301,19 +295,19 @@ namespace imgproc::matchTemplatePrepared_CCOFF_8UC2( templ.cols, templ.rows, image_sums[0], image_sums[1], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], - result); + result, StreamAccessor::getStream(stream)); break; case 3: imgproc::matchTemplatePrepared_CCOFF_8UC3( templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], - result); + result, StreamAccessor::getStream(stream)); break; case 4: imgproc::matchTemplatePrepared_CCOFF_8UC4( templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], - (unsigned int)templ_sum[3], result); + (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream)); break; default: CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels"); @@ -322,25 +316,34 @@ namespace } - void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { GpuMat imagef, templf; - image.convertTo(imagef, CV_32F); - templ.convertTo(templf, CV_32F); - matchTemplate_CCORR_32F(imagef, templf, result); + if (stream) + { + stream.enqueueConvert(image, imagef, CV_32F); + stream.enqueueConvert(templ, templf, CV_32F); + } + else + { + image.convertTo(imagef, CV_32F); + templ.convertTo(templf, CV_32F); + } + + matchTemplate_CCORR_32F(imagef, templf, result, stream); if (image.channels() == 1) { GpuMat image_sum, image_sqsum; - integral(image, image_sum); - sqrIntegral(image, image_sqsum); + integral(image, image_sum, stream); + sqrIntegral(image, image_sqsum, stream); unsigned int templ_sum = (unsigned int)sum(templ)[0]; unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0]; imgproc::matchTemplatePrepared_CCOFF_NORMED_8U( templ.cols, templ.rows, image_sum, image_sqsum, - templ_sum, templ_sqsum, result); + templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream)); } else { @@ -351,8 +354,8 @@ namespace split(image, images); for (int i = 0; i < image.channels(); ++i) { - integral(images[i], image_sums[i]); - sqrIntegral(images[i], image_sqsums[i]); + integral(images[i], image_sums[i], stream); + sqrIntegral(images[i], image_sqsums[i], stream); } Scalar templ_sum = sum(templ); @@ -367,7 +370,7 @@ namespace image_sums[1], image_sqsums[1], (unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], - result); + result, StreamAccessor::getStream(stream)); break; case 3: imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC3( @@ -378,7 +381,7 @@ namespace (unsigned 
int)templ_sum[0], (unsigned int)templ_sqsum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2], - result); + result, StreamAccessor::getStream(stream)); break; case 4: imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC4( @@ -391,7 +394,7 @@ namespace (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2], (unsigned int)templ_sum[3], (unsigned int)templ_sqsum[3], - result); + result, StreamAccessor::getStream(stream)); break; default: CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels"); @@ -401,12 +404,12 @@ namespace } -void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method) +void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream) { CV_Assert(image.type() == templ.type()); CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows); - typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&); + typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, Stream& stream); static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U, ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U, @@ -424,7 +427,7 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& re Caller caller = callers[method]; CV_Assert(caller); - caller(image, templ, result); + caller(image, templ, result, stream); } #endif diff --git a/modules/gpu/src/optical_flow.cpp b/modules/gpu/src/optical_flow.cpp index 7891ef5548..19754c0578 100644 --- a/modules/gpu/src/optical_flow.cpp +++ b/modules/gpu/src/optical_flow.cpp @@ -59,10 +59,8 @@ namespace NCVMatrix& u, NCVMatrix& v, const cudaDeviceProp& devProp) { NCVMemStackAllocator gpuCounter(static_cast(devProp.textureAlignment)); - CV_Assert(gpuCounter.isInitialized()); - NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0); - CV_Assert(ncvStat == NCV_SUCCESS); + ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) ); return gpuCounter.maxSize(); } @@ -130,10 +128,8 @@ void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& f ensureSizeIsEnough(1, bufSize, CV_8UC1, buf); NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast(devProp.textureAlignment), buf.ptr()); - CV_Assert(gpuAllocator.isInitialized()); - NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream); - CV_Assert(ncvStat == NCV_SUCCESS); + ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) ); } void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv, @@ -189,7 +185,7 @@ void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, cons state.ppBuffers[4] = bui.ptr(); state.ppBuffers[5] = bvi.ptr(); - nppSafeCall( nppiStInterpolateFrames(&state) ); + ncvSafeCall( nppiStInterpolateFrames(&state) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index ea5259b075..3c84c93ba2 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -39,15 +39,16 @@ // the use of this software, even if advised of the possibility of such damage. 
// //M*/ + #ifndef __OPENCV_PRECOMP_H__ #define __OPENCV_PRECOMP_H__ #if _MSC_VER >= 1200 -#pragma warning( disable: 4251 4710 4711 4514 4996 ) + #pragma warning( disable: 4251 4710 4711 4514 4996 ) #endif #ifdef HAVE_CVCONFIG_H -#include "cvconfig.h" + #include "cvconfig.h" #endif #include @@ -65,33 +66,43 @@ #include "opencv2/calib3d/calib3d.hpp" #include "opencv2/core/internal.hpp" -#if defined(HAVE_CUDA) +#define OPENCV_GPU_UNUSED(x) (void)x + +#ifdef HAVE_CUDA - #include "internal_shared.hpp" #include "cuda_runtime_api.h" - #include "cufft.h" + #include "npp.h" + + #ifdef HAVE_CUFFT + #include "cufft.h" + #endif + + #ifdef HAVE_CUBLAS + #include "cublas.h" + #endif + + #include "internal_shared.hpp" #include "opencv2/gpu/stream_accessor.hpp" - #include "npp.h" #include "nvidia/core/NCV.hpp" #include "nvidia/NPP_staging/NPP_staging.hpp" #include "nvidia/NCVHaarObjectDetection.hpp" #include "nvidia/NCVBroxOpticalFlow.hpp" -#define CUDART_MINIMUM_REQUIRED_VERSION 4000 -#define NPP_MINIMUM_REQUIRED_VERSION 4000 + #define CUDART_MINIMUM_REQUIRED_VERSION 4000 + #define NPP_MINIMUM_REQUIRED_VERSION 4000 -#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) - #error "Insufficient Cuda Runtime library version, please update it." -#endif + #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) + #error "Insufficient Cuda Runtime library version, please update it." + #endif -#if (NPP_VERSION_MAJOR*1000+NPP_VERSION_MINOR*100+NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION) - #error "Insufficient NPP version, please update it." -#endif + #if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION) + #error "Insufficient NPP version, please update it." + #endif -#if defined(CUDA_ARCH_BIN_OR_PTX_10) - #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0" -#endif + #if defined(CUDA_ARCH_BIN_OR_PTX_10) + #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0" + #endif static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); }
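The NPP, NCV and CUFFT tables above all feed one generic lookup helper instead of per-API string functions. A minimal standalone sketch of that pattern follows; ErrorEntry, error_entry and getErrorString mirror the names used in the patch, while the demo enum, comparer functor and main() are illustrative assumptions only:

    #include <algorithm>
    #include <sstream>
    #include <string>
    #include <iostream>

    struct ErrorEntry { int code; const char* str; };
    #define error_entry(entry) { entry, #entry }   // stringize the enum name for the message

    // Illustrative codes only; the real tables use the NPP/NCV/CUFFT status enums.
    enum { DEMO_BAD_ARG = -1, DEMO_ALLOC_FAILED = -2 };
    const ErrorEntry demo_errors[] = { error_entry(DEMO_BAD_ARG), error_entry(DEMO_ALLOC_FAILED) };
    const size_t demo_error_num = sizeof(demo_errors) / sizeof(demo_errors[0]);

    struct ErrorEntryComparer
    {
        int code;
        ErrorEntryComparer(int code_) : code(code_) {}
        bool operator()(const ErrorEntry& e) const { return e.code == code; }
    };

    std::string getErrorString(int code, const ErrorEntry* errors, size_t n)
    {
        // Linear search over the table; unknown codes fall back to a generic message.
        size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
        const std::string msg = (idx != n) ? errors[idx].str : std::string("Unknown error code");
        std::ostringstream ostr;
        ostr << msg << " [Code = " << code << "]";
        return ostr.str();
    }

    int main() { std::cout << getErrorString(-2, demo_errors, demo_error_num) << std::endl; }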
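The asynchronous overloads added here (meanShiftFiltering, mulSpectrums, dft, convolve, matchTemplate, ...) take an optional cv::gpu::Stream and default to Stream::Null(), so existing synchronous callers keep working. A usage sketch against the post-patch matchTemplate signature; it assumes an 8-bit input image, uses synchronous GpuMat upload constructors for brevity, and a page-locked CudaMem destination would be needed for a truly asynchronous download:

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void match_async(const cv::Mat& image, const cv::Mat& templ, cv::Mat& result)
    {
        cv::gpu::GpuMat d_image(image), d_templ(templ), d_result;   // synchronous uploads
        cv::gpu::Stream stream;

        // Enqueue the matcher and the download on one stream; the host only blocks at the end.
        cv::gpu::matchTemplate(d_image, d_templ, d_result, CV_TM_CCORR, stream);
        stream.enqueueDownload(d_result, result);
        stream.waitForCompletion();
    }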
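The dft() and convolve() changes rely on binding a cuFFT plan to the caller's stream before execution, via cufftSetStream on the handle obtained from StreamAccessor::getStream. A generic standalone illustration of that pattern, not the module's code; buffer sizing is the caller's responsibility (for R2C the output needs rows * (cols/2 + 1) complex values) and return codes are ignored here where the module would wrap them in cufftSafeCall:

    #include <cufft.h>
    #include <cuda_runtime.h>

    void fft_on_stream(float* d_src, cufftComplex* d_dst, int rows, int cols, cudaStream_t stream)
    {
        cufftHandle plan;
        cufftPlan2d(&plan, rows, cols, CUFFT_R2C);   // plan creation itself is not stream-ordered
        cufftSetStream(plan, stream);                // subsequent exec calls are issued on 'stream'
        cufftExecR2C(plan, d_src, d_dst);            // asynchronous with respect to the host
        cudaStreamSynchronize(stream);               // ensure the transform finished before tearing down
        cufftDestroy(plan);
    }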