added WITH_CUFFT and WITH_CUBLAS flags to cmake scripts

fixed gpu module error reporting
added asynchronous version of some functions
pull/13383/head
Vladislav Vinogradov 13 years ago
parent a73b509b1e
commit 40ee754e28
16 files changed:
  1. CMakeLists.txt (13)
  2. cvconfig.h.cmake (6)
  3. modules/core/include/opencv2/core/types_c.h (4)
  4. modules/core/src/system.cpp (5)
  5. modules/gpu/CMakeLists.txt (11)
  6. modules/gpu/include/opencv2/gpu/gpu.hpp (44)
  7. modules/gpu/src/arithm.cpp (4)
  8. modules/gpu/src/cascadeclassifier.cpp (11)
  9. modules/gpu/src/cuda/imgproc.cu (121)
  10. modules/gpu/src/cuda/match_template.cu (388)
  11. modules/gpu/src/cuda/safe_call.hpp (31)
  12. modules/gpu/src/error.cpp (194)
  13. modules/gpu/src/imgproc.cpp (278)
  14. modules/gpu/src/match_template.cpp (177)
  15. modules/gpu/src/optical_flow.cpp (10)
  16. modules/gpu/src/precomp.hpp (45)

@ -452,8 +452,12 @@ set(WITH_EIGEN ON CACHE BOOL "Include Eigen2/Eigen3 support")
if( CMAKE_VERSION VERSION_GREATER "2.8")
set(WITH_CUDA ON CACHE BOOL "Include NVidia Cuda Runtime support")
set(WITH_CUFFT ON CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support")
set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support")
else()
set(WITH_CUDA OFF CACHE BOOL "Include NVidia Cuda Runtime support")
set(WITH_CUFFT OFF CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support")
set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support")
endif()
set(WITH_OPENNI OFF CACHE BOOL "Include OpenNI support")
@ -995,6 +999,15 @@ if(WITH_CUDA)
if(CUDA_FOUND)
set(HAVE_CUDA 1)
if(WITH_CUFFT)
set(HAVE_CUFFT 1)
endif()
if(WITH_CUBLAS)
set(HAVE_CUBLAS 1)
endif()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")

@ -172,6 +172,12 @@
/* NVidia Cuda Runtime API*/
#cmakedefine HAVE_CUDA
/* NVidia Cuda Fast Fourier Transform (FFT) API*/
#cmakedefine HAVE_CUFFT
/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
#cmakedefine HAVE_CUBLAS
/* Compile for 'real' NVIDIA GPU architectures */
#define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"
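These generated defines let dependent sources compile out the cuFFT- and cuBLAS-backed paths when the libraries are disabled. A minimal sketch of guarding on them (illustrative; the function name is hypothetical, not part of the patch):

    #include "cvconfig.h"

    void reportOptionalGpuLibs()
    {
    #ifdef HAVE_CUFFT
        // cuFFT was linked in: FFT-based dft()/convolve() paths are compiled
    #endif
    #ifdef HAVE_CUBLAS
        // cuBLAS was linked in: BLAS-backed paths are compiled
    #endif
    }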

@ -250,9 +250,7 @@ enum {
CV_StsBadMemBlock= -214, /* an allocated block has been corrupted */
CV_StsAssert= -215, /* assertion failed */
CV_GpuNotSupported= -216,
CV_GpuApiCallError= -217,
CV_GpuNppCallError= -218,
CV_GpuCufftCallError= -219
CV_GpuApiCallError= -217
};
/****************************************************************************************\

@ -629,9 +629,8 @@ CV_IMPL const char* cvErrorStr( int status )
case CV_StsNotImplemented : return "The function/feature is not implemented";
case CV_StsBadMemBlock : return "Memory block has been corrupted";
case CV_StsAssert : return "Assertion failed";
case CV_GpuNotSupported : return "No GPU support";
case CV_GpuApiCallError : return "Gpu Api call";
case CV_GpuNppCallError : return "Npp Api call";
case CV_GpuNotSupported : return "No GPU support";
case CV_GpuApiCallError : return "Gpu Api call";
};
sprintf(buf, "Unknown %s code %d", status >= 0 ? "status":"error", status);
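Since NPP and CUFFT failures are now funneled through the single CV_GpuApiCallError code, callers only have to recognize one GPU error status. A hedged illustration (hypothetical helper name):

    #include <opencv2/core/core_c.h>
    #include <cstdio>

    void printGpuErrorName()
    {
        // Prints "Gpu Api call" for any GPU runtime, NPP or CUFFT failure.
        std::printf("%s\n", cvErrorStr(CV_GpuApiCallError));
    }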

@ -120,12 +120,19 @@ set_target_properties(${the_target} PROPERTIES
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
target_link_libraries(${the_target} ${CUDA_LIBRARIES})
unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)
target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
if(HAVE_CUFFT)
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
endif()
if(HAVE_CUBLAS)
CUDA_ADD_CUBLAS_TO_TARGET(${the_target})
endif()
endif()
if(MSVC)

@ -141,8 +141,8 @@ namespace cv
//////////////////////////////// Error handling ////////////////////////
CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func);
//CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
//CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func);
//////////////////////////////// CudaMem ////////////////////////////////
// CudaMem is limited cv::Mat with page locked memory allocation.
@ -628,11 +628,11 @@ namespace cv
//! Does mean shift filtering on GPU.
CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());
//! Does mean shift procedure on GPU.
CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());
//! Does mean shift segmentation with elimination of small regions.
CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize,
@ -683,10 +683,12 @@ namespace cv
//! rotate 8bit single or four channel image
//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
//! supports CV_8UC1, CV_8UC4 types
CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null());
CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType,
const Scalar& value = Scalar(), Stream& stream = Stream::Null());
//! computes the integral image
//! sum will have CV_32S type, but will contain unsigned int values
@ -715,21 +717,26 @@ namespace cv
CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());
//! computes Harris cornerness criteria at each image pixel
CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k,
int borderType = BORDER_REFLECT101);
CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k,
int borderType = BORDER_REFLECT101);
CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k,
int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());
//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize,
int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());
//! performs per-element multiplication of two full (not packed) Fourier spectrums
//! supports 32FC2 matrixes only (interleaved format)
CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false);
CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());
//! performs per-element multiplication of two full (not packed) Fourier spectrums
//! supports 32FC2 matrixes only (interleaved format)
CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags,
float scale, bool conjB=false);
CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
//! Param dft_size is the size of DFT transform.
@ -742,19 +749,14 @@ namespace cv
//! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
//!
//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0);
CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
//! supports source images of 32FC1 type only
//! result matrix will have 32FC1 type
CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
bool ccorr=false);
struct CV_EXPORTS ConvolveBuf;
//! buffered version
CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
bool ccorr, ConvolveBuf& buf);
CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);
CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());
struct CV_EXPORTS ConvolveBuf
{
@ -766,7 +768,7 @@ namespace cv
private:
static Size estimateBlockSize(Size result_size, Size templ_size);
friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&);
friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream);
Size result_size;
Size block_size;
@ -778,7 +780,7 @@ namespace cv
};
//! computes the proximity map for the raster template and the image where the template is searched for
CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null());
//! smoothes the source image and downsamples it
CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
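Taken together, the new Stream parameters let several GPU operations be queued on one cv::gpu::Stream and overlapped with CPU work. A hedged usage sketch built only from the declarations above (error handling omitted):

    #include <opencv2/gpu/gpu.hpp>

    void matchAsync(const cv::gpu::GpuMat& image, const cv::gpu::GpuMat& templ, cv::gpu::GpuMat& result)
    {
        cv::gpu::Stream stream;                                            // asynchronous execution queue
        cv::gpu::matchTemplate(image, templ, result, CV_TM_CCORR, stream); // enqueued; returns without waiting
        // ... CPU work can overlap with the GPU computation here ...
        stream.waitForCompletion();                                        // block until the queued work finishes
    }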

@ -93,7 +93,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
}
else // if (src.elemSize() == 8)
@ -104,7 +104,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
}

@ -66,10 +66,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1)
{
ncvSetDebugOutputHandler(NCVDebugOutputHandler);
if (ncvStat != load(filename))
{
CV_Error(CV_GpuApiCallError, "Error in GPU cacade load");
}
ncvSafeCall( load(filename) );
}
@ -287,11 +284,7 @@ int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMa
}
unsigned int numDetections;
NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections);
if (ncvStat != NCV_SUCCESS)
{
CV_Error(CV_GpuApiCallError, "Error in face detectioln");
}
ncvSafeCall( impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections) );
return numDetections;
}

@ -120,8 +120,7 @@ namespace cv { namespace gpu { namespace imgproc
return make_short2((short)x0, (short)y0);
}
extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps )
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace imgproc
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps )
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@ -145,7 +144,7 @@ namespace cv { namespace gpu { namespace imgproc
}
}
extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps)
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
@ -155,13 +154,16 @@ namespace cv { namespace gpu { namespace imgproc
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps)
void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
@ -171,11 +173,13 @@ namespace cv { namespace gpu { namespace imgproc
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
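Every caller in this file now follows the same idiom: launch on the stream handed down from the cv::gpu layer, check the launch, and synchronize only when the default (null) stream was used, so work queued on a user stream stays asynchronous. A condensed sketch of the pattern (kernel and names hypothetical; assumes this file's safe_call.hpp is included):

    __global__ void touch(unsigned char* data, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) data[i] = 0; // placeholder work
    }

    void touch_gpu(unsigned char* data, int n, cudaStream_t stream)
    {
        dim3 block(256);
        dim3 grid((n + block.x - 1) / block.x);
        touch<<<grid, block, 0, stream>>>(data, n);      // enqueued on the caller's stream
        cudaSafeCall( cudaGetLastError() );              // launch-configuration check
        if (stream == 0)                                 // default stream: keep the old blocking behavior
            cudaSafeCall( cudaDeviceSynchronize() );
    }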
/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
@ -389,15 +393,16 @@ namespace cv { namespace gpu { namespace imgproc
}
}
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst)
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));
extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
extractCovData_kernel<<<grid, threads, 0, stream>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
@ -475,7 +480,7 @@ namespace cv { namespace gpu { namespace imgproc
}
void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,
int border_type)
int border_type, cudaStream_t stream)
{
const int rows = Dx.rows;
const int cols = Dx.cols;
@ -492,7 +497,7 @@ namespace cv { namespace gpu { namespace imgproc
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerHarris_kernel<<<grid, threads>>>(
cornerHarris_kernel<<<grid, threads, 0, stream>>>(
cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
break;
case BORDER_REPLICATE_GPU:
@ -500,16 +505,18 @@ namespace cv { namespace gpu { namespace imgproc
harrisDxTex.addressMode[1] = cudaAddressModeClamp;
harrisDyTex.addressMode[0] = cudaAddressModeClamp;
harrisDyTex.addressMode[1] = cudaAddressModeClamp;
cornerHarris_kernel<<<grid, threads>>>(cols, rows, block_size, k, dst);
cornerHarris_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, k, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall(cudaUnbindTexture(harrisDxTex));
cudaSafeCall(cudaUnbindTexture(harrisDyTex));
//cudaSafeCall(cudaUnbindTexture(harrisDxTex));
//cudaSafeCall(cudaUnbindTexture(harrisDyTex));
}
/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
@ -592,7 +599,7 @@ namespace cv { namespace gpu { namespace imgproc
}
void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,
int border_type)
int border_type, cudaStream_t stream)
{
const int rows = Dx.rows;
const int cols = Dx.cols;
@ -609,7 +616,7 @@ namespace cv { namespace gpu { namespace imgproc
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerMinEigenVal_kernel<<<grid, threads>>>(
cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(
cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
break;
case BORDER_REPLICATE_GPU:
@ -617,16 +624,18 @@ namespace cv { namespace gpu { namespace imgproc
minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;
minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;
minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;
cornerMinEigenVal_kernel<<<grid, threads>>>(cols, rows, block_size, dst);
cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaDeviceSynchronize());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
//cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
//cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
}
////////////////////////////// Column Sum //////////////////////////////////////
@ -667,8 +676,7 @@ namespace cv { namespace gpu { namespace imgproc
//////////////////////////////////////////////////////////////////////////
// mulSpectrums
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -680,25 +688,23 @@ namespace cv { namespace gpu { namespace imgproc
}
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ
__global__ void mulSpectrumsKernel_CONJ(
const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -710,25 +716,23 @@ namespace cv { namespace gpu { namespace imgproc
}
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
__global__ void mulAndScaleSpectrumsKernel(
const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -741,25 +745,23 @@ namespace cv { namespace gpu { namespace imgproc
}
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ
__global__ void mulAndScaleSpectrumsKernel_CONJ(
const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -772,16 +774,16 @@ namespace cv { namespace gpu { namespace imgproc
}
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
@ -1015,17 +1017,18 @@ namespace cv { namespace gpu { namespace imgproc
}
}
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel)
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
convolve<<<grid, block>>>(src, dst, kWidth, kHeight);
convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

@ -78,11 +78,11 @@ __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a
__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
//////////////////////////////////////////////////////////////////////
// Naive_CCORR
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(
int w, int h, const PtrStepb image, const PtrStepb templ,
DevMem2Df result)
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
@ -106,73 +106,49 @@ __global__ void matchTemplateNaiveKernel_CCORR(
}
}
void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplateNaiveKernel_CCORR<float, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_CCORR<float, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_CCORR<float, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_CCORR<float, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}

template <typename T, int cn>
void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplateNaiveKernel_CCORR<uchar, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_CCORR<uchar, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_CCORR<uchar, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_CCORR<uchar, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
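The per-channel switch statements are being collapsed into tables of function pointers indexed by channel count: each slot is a distinct template instantiation, so the kernel launch stays fully typed while dispatch becomes a single array lookup. A standalone sketch of the idiom:

    #include <cstdio>

    template <typename T, int cn>
    void process() { std::printf("%d channels of a %d-byte type\n", cn, (int)sizeof(T)); }

    typedef void (*caller_t)();

    void dispatch(int cn) // cn must be 1..4; slot 0 is an unused placeholder
    {
        static const caller_t callers[] = { 0, process<float, 1>, process<float, 2>, process<float, 3>, process<float, 4> };
        callers[cn]();
    }

    int main() { dispatch(3); }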
//////////////////////////////////////////////////////////////////////
// Naive_SQDIFF
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(
int w, int h, const PtrStepb image, const PtrStepb templ,
DevMem2Df result)
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
@ -200,73 +176,48 @@ __global__ void matchTemplateNaiveKernel_SQDIFF(
}
}
void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplateNaiveKernel_SQDIFF<float, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_SQDIFF<float, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_SQDIFF<float, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_SQDIFF<float, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}

template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplateNaiveKernel_SQDIFF<uchar, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_SQDIFF<uchar, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_SQDIFF<uchar, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_SQDIFF<uchar, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result)
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -281,37 +232,34 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_8U(
}
}
void matchTemplatePrepared_SQDIFF_8U(
int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplatePreparedKernel_SQDIFF_8U<1><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
matchTemplatePreparedKernel_SQDIFF_8U<2><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
matchTemplatePreparedKernel_SQDIFF_8U<3><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
matchTemplatePreparedKernel_SQDIFF_8U<4><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}

template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn,
cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF_NORMED
// normAcc* are accurate normalization routines which make GPU matchTemplate
// consistent with CPU one
@ -337,9 +285,7 @@ __device__ float normAcc_SQDIFF(float num, float denum)
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result)
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -355,41 +301,37 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum,
DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_NORMED_8U(
int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<1><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<2><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<3><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<4><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum,
DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF
__global__ void matchTemplatePreparedKernel_CCOFF_8U(
int w, int h, float templ_sum_scale,
const PtrStep<unsigned int> image_sum, DevMem2Df result)
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -404,21 +346,20 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U(
}
}
void matchTemplatePrepared_CCOFF_8U(
int w, int h, const DevMem2D_<unsigned int> image_sum,
unsigned int templ_sum, DevMem2Df result)
void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(
w, h, (float)templ_sum / (w * h), image_sum, result);
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
@ -442,25 +383,27 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
}
}
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
const DevMem2D_<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
int w, int h,
float templ_sum_scale_r,
@ -492,7 +435,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
}
}
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@ -501,11 +443,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads>>>(
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
@ -513,10 +456,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
int w, int h,
float templ_sum_scale_r,
@ -554,7 +499,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
}
}
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@ -565,11 +509,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads>>>(
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
@ -579,9 +524,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF_NORMED
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
@ -607,12 +555,11 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const DevMem2D_<unsigned int> image_sum,
const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned int templ_sqsum,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -620,15 +567,18 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
float weight = 1.f / (w * h);
float templ_sum_scale = templ_sum * weight;
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
@ -663,14 +613,13 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -680,7 +629,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads>>>(
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g,
templ_sqsum_scale,
@ -689,10 +639,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
@ -736,7 +688,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@ -745,7 +696,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -757,7 +708,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads>>>(
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
templ_sqsum_scale,
@ -767,10 +719,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
@ -821,7 +775,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@ -832,7 +785,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
unsigned int templ_sum_a, unsigned int templ_sqsum_a,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -846,7 +799,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads>>>(
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
templ_sqsum_scale,
@ -857,9 +811,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// normalize
template <int cn>
__global__ void normalizeKernel_8U(
@ -878,32 +835,36 @@ __global__ void normalizeKernel_8U(
}
}
void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn)
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
normalizeKernel_8U<1><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
normalizeKernel_8U<2><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
normalizeKernel_8U<3><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
@ -920,8 +881,7 @@ __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
}
}
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn)
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -929,23 +889,21 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn)
switch (cn)
{
case 1:
extractFirstChannel_32F<1><<<grid, threads>>>(image, result);
extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
break;
case 2:
extractFirstChannel_32F<2><<<grid, threads>>>(image, result);
extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
break;
case 3:
extractFirstChannel_32F<3><<<grid, threads>>>(image, result);
extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
break;
case 4:
extractFirstChannel_32F<4><<<grid, threads>>>(image, result);
extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}}}

@ -45,16 +45,18 @@
#include "cuda_runtime_api.h"
#include "cufft.h"
//#include <nppdefs.h>
#include "NCV.hpp"
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv
@ -62,8 +64,9 @@ namespace cv
namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
@ -71,17 +74,23 @@ namespace cv
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
}
}
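Each wrapper converts a library-specific status into the module's single error path, capturing file, line and (under GCC) the function name. A hedged usage sketch for the CUFFT wrapper, assuming this header is included (hypothetical helper name):

    void makePlan1d(int n)
    {
        cufftHandle plan;
        cufftSafeCall( cufftPlan1d(&plan, n, CUFFT_C2C, 1) ); // throws cv::Exception via cv::gpu::cufftError on failure
        cufftSafeCall( cufftDestroy(plan) );
    }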

@ -42,30 +42,45 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA)
#else /* !defined (HAVE_CUDA) */
#ifdef HAVE_CUDA
namespace
{
#define error_entry(entry) { entry, #entry }
//////////////////////////////////////////////////////////////////////////
// NPP errors
struct NppError
struct ErrorEntry
{
int error;
int code;
string str;
}
};
struct ErrorEntryComparer
{
int code;
ErrorEntryComparer(int code_) : code(code_) {};
bool operator()(const ErrorEntry& e) const { return e.code == code; }
};
string getErrorString(int code, const ErrorEntry* errors, size_t n)
{
size_t idx = find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
const string& msg = (idx != n) ? errors[idx].str : string("Unknown error code");
ostringstream ostr;
ostr << msg << " [Code = " << code << "]";
return ostr.str();
}
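One lookup helper now serves all three error tables, and the stringifying error_entry macro keeps each message identical to its enumerator. An illustrative call (the numeric code comes from the library headers):

    // string msg = getErrorString(NCV_FILE_ERROR, ncv_errors, ncv_error_num);
    // msg == "NCV_FILE_ERROR [Code = <numeric value of NCV_FILE_ERROR>]"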
//////////////////////////////////////////////////////////////////////////
// NPP errors
npp_errors [] =
const ErrorEntry npp_errors [] =
{
error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
@ -74,6 +89,7 @@ namespace
#if defined (_MSC_VER)
error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
#endif
error_entry( NPP_BAD_ARG_ERROR ),
error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_TEXTURE_BIND_ERROR ),
@ -110,106 +126,116 @@ namespace
error_entry( NPP_ODD_ROI_WARNING )
};
const size_t error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
struct Searcher
{
int err;
Searcher(int err_) : err(err_) {};
bool operator()(const NppError& e) const { return e.error == err; }
};

//////////////////////////////////////////////////////////////////////////
// NCV errors

const ErrorEntry ncv_errors [] =
{
error_entry( NCV_SUCCESS ),
error_entry( NCV_UNKNOWN_ERROR ),
error_entry( NCV_CUDA_ERROR ),
error_entry( NCV_NPP_ERROR ),
error_entry( NCV_FILE_ERROR ),
error_entry( NCV_NULL_PTR ),
error_entry( NCV_INCONSISTENT_INPUT ),
error_entry( NCV_TEXTURE_BIND_ERROR ),
error_entry( NCV_DIMENSIONS_INVALID ),
error_entry( NCV_INVALID_ROI ),
error_entry( NCV_INVALID_STEP ),
error_entry( NCV_INVALID_SCALE ),
error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ),
error_entry( NCV_ALLOCATOR_BAD_ALLOC ),
error_entry( NCV_ALLOCATOR_BAD_DEALLOC ),
error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ),
error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ),
error_entry( NCV_ALLOCATOR_BAD_REUSE ),
error_entry( NCV_MEM_COPY_ERROR ),
error_entry( NCV_MEM_RESIDENCE_ERROR ),
error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ),
error_entry( NCV_HAAR_INVALID_PIXEL_STEP ),
error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ),
error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ),
error_entry( NCV_HAAR_TOO_LARGE_FEATURES ),
error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ),
error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ),
error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ),
error_entry( NPPST_SUCCESS ),
error_entry( NPPST_ERROR ),
error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ),
error_entry( NPPST_NULL_POINTER_ERROR ),
error_entry( NPPST_TEXTURE_BIND_ERROR ),
error_entry( NPPST_MEMCPY_ERROR ),
error_entry( NPPST_MEM_ALLOC_ERR ),
error_entry( NPPST_MEMFREE_ERR ),
error_entry( NPPST_INVALID_ROI ),
error_entry( NPPST_INVALID_STEP ),
error_entry( NPPST_INVALID_SCALE ),
error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ),
error_entry( NPPST_MEM_RESIDENCE_ERROR ),
error_entry( NPPST_MEM_INTERNAL_ERROR )
};
const size_t ncv_error_num = sizeof(ncv_errors) / sizeof(ncv_errors[0]);
//////////////////////////////////////////////////////////////////////////
// CUFFT errors
const ErrorEntry cufft_errors[] =
{
error_entry( CUFFT_INVALID_PLAN ),
error_entry( CUFFT_ALLOC_FAILED ),
error_entry( CUFFT_INVALID_TYPE ),
error_entry( CUFFT_INVALID_VALUE ),
error_entry( CUFFT_INTERNAL_ERROR ),
error_entry( CUFFT_EXEC_FAILED ),
error_entry( CUFFT_SETUP_FAILED ),
error_entry( CUFFT_INVALID_SIZE ),
error_entry( CUFFT_UNALIGNED_DATA )
};
const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
}
namespace cv
{
namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func)
{
int code = CV_GpuApiCallError;
if (uncaught_exception())
{
const char* errorStr = cvErrorStr(code);
const char* function = func ? func : "unknown function";
std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
std::cerr.flush();
cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
cerr.flush();
}
else
cv::error( cv::Exception(code, error_string, func, file, line) );
}
void nppError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, npp_errors, npp_error_num);
cv::gpu::error(msg.c_str(), file, line, func);
}
void ncvError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, ncv_errors, ncv_error_num);
cv::gpu::error(msg.c_str(), file, line, func);
}
void cufftError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, cufft_errors, cufft_error_num);
cv::gpu::error(msg.c_str(), file, line, func);
}
}
}
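The call-site wrappers used throughout the module (nppSafeCall, ncvSafeCall, cufftSafeCall) are defined in the module's safe-call header, which is not shown in this hunk. A minimal sketch of the pattern they presumably follow, forwarding the failing status plus source location to the reporters above (the exact spelling is an assumption):
// Hypothetical sketch only; NPP treats negative statuses as errors,
// while NCV and CUFFT use 0 for success.
#define nppSafeCall(expr)   { int err_ = (int)(expr); if (err_ < 0)  cv::gpu::nppError(err_, __FILE__, __LINE__, ""); }
#define ncvSafeCall(expr)   { int err_ = (int)(expr); if (err_ != 0) cv::gpu::ncvError(err_, __FILE__, __LINE__, ""); }
#define cufftSafeCall(expr) { int err_ = (int)(expr); if (err_ != 0) cv::gpu::cufftError(err_, __FILE__, __LINE__, ""); }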
@ -48,8 +48,8 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, const Scalar&, Stream&){ throw_nogpu(); }
void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); }
void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
@ -82,14 +82,16 @@ void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nog
void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_nogpu(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_nogpu(); }
void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_nogpu(); }
void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
@ -151,10 +153,10 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
namespace cv { namespace gpu { namespace imgproc
{
extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps);
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}}}
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)
{
if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" );
@ -174,7 +176,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0);
imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}
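With the Stream overload in place, the filter can be enqueued without blocking the host. A hedged usage sketch (file name and variables are hypothetical; the GPU filter expects a CV_8UC4 image):
cv::Mat bgr = cv::imread("input.png", 1), bgra;
cv::cvtColor(bgr, bgra, CV_BGR2BGRA);          // convert to the 4-channel layout the kernel needs
cv::gpu::GpuMat d_src, d_dst;
d_src.upload(bgra);
cv::gpu::Stream stream;
cv::gpu::meanShiftFiltering(d_src, d_dst, 10, 20,
    cv::TermCriteria(CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 5, 1), stream);
stream.waitForCompletion();                    // d_dst is valid only after this returns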
////////////////////////////////////////////////////////////////////////
@ -182,10 +184,10 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
namespace cv { namespace gpu { namespace imgproc
{
extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps);
void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}}}
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
{
if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" );
@ -206,7 +208,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0);
imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
@ -766,14 +768,14 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStStreamHandler h(stream);
ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
if (stream == 0)
@ -819,7 +821,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
Ncv32u bufSize;
ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
GpuMat buf(1, bufSize, CV_8U);
cudaStream_t stream = StreamAccessor::getStream(s);
@ -827,7 +829,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
NppStStreamHandler h(stream);
sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
if (stream == 0)
@ -1260,16 +1262,16 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
namespace cv { namespace gpu { namespace imgproc {
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);
void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
}}}
namespace
{
template <typename T>
void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
{
double scale = (double)(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
if (ksize < 0)
@ -1283,25 +1285,25 @@ namespace
if (ksize > 0)
{
Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
}
else
{
Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
}
}
void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
{
switch (src.type())
{
case CV_8U:
extractCovData<unsigned char>(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
break;
case CV_32F:
extractCovData<float>(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
break;
default:
CV_Error(CV_StsBadArg, "extractCovData: unsupported type of the source matrix");
@ -1343,6 +1345,12 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ks
}
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType)
{
GpuMat buf;
cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType);
}
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
{
CV_Assert(borderType == cv::BORDER_REFLECT101 ||
borderType == cv::BORDER_REPLICATE);
@ -1350,9 +1358,9 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
dst.create(src.size(), CV_32F);
imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
}
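The new buffered overload lets callers keep the scratch images alive across invocations, so repeated calls with same-sized frames allocate nothing. A hedged usage sketch (the frame data is a stand-in):
cv::gpu::GpuMat d_frame, d_response, Dx, Dy, buf;   // reused across iterations
cv::gpu::Stream stream;
d_frame.upload(cv::Mat::zeros(480, 640, CV_8UC1));  // stand-in for a real frame
cv::gpu::cornerHarris(d_frame, d_response, Dx, Dy, buf,
                      5, 3, 0.04, cv::BORDER_REFLECT101, stream);
stream.waitForCompletion();                         // d_response now holds the Harris map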
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
@ -1362,6 +1370,12 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, i
}
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)
{
GpuMat buf;
cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType);
}
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
{
CV_Assert(borderType == cv::BORDER_REFLECT101 ||
borderType == cv::BORDER_REPLICATE);
@ -1369,9 +1383,9 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
dst.create(src.size(), CV_32F);
imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
@ -1379,21 +1393,16 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
namespace cv { namespace gpu { namespace imgproc
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
}}}
void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
{
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { imgproc::mulSpectrums, imgproc::mulSpectrums_CONJ };
CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size());
@ -1401,7 +1410,7 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
c.create(a.size(), CV_32FC2);
Caller caller = callers[(int)conjB];
caller(a, b, c, StreamAccessor::getStream(stream));
}
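A hedged usage sketch of the stream-aware spectrum multiply (sizes are illustrative; flags is a mock parameter kept for CPU/GPU signature parity):
cv::gpu::GpuMat a(256, 256, CV_32FC2), b(256, 256, CV_32FC2), c;
cv::gpu::Stream stream;
cv::gpu::mulSpectrums(a, b, c, 0 /*flags: unused*/, true /*conjugate b*/, stream);
stream.waitForCompletion();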
//////////////////////////////////////////////////////////////////////////////
@ -1409,21 +1418,16 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
namespace cv { namespace gpu { namespace imgproc
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
}}}
void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
{
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { imgproc::mulAndScaleSpectrums, imgproc::mulAndScaleSpectrums_CONJ };
CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size());
@ -1431,14 +1435,26 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
c.create(a.size(), CV_32FC2);
Caller caller = callers[(int)conjB];
caller(a, b, scale, c, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// dft
void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)
{
#ifndef HAVE_CUFFT
OPENCV_GPU_UNUSED(src);
OPENCV_GPU_UNUSED(dst);
OPENCV_GPU_UNUSED(dft_size);
OPENCV_GPU_UNUSED(flags);
OPENCV_GPU_UNUSED(stream);
throw_nogpu();
#else
CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
// We don't support unpacked output (in the case of real input)
@ -1483,6 +1499,8 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags)
else
cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type);
cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
if (is_complex_input)
{
if (is_complex_output)
@ -1514,7 +1532,9 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags)
cufftSafeCall(cufftDestroy(plan));
if (is_scaled_dft)
multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream);
#endif
}
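A hedged usage sketch of the asynchronous dft: a real-to-complex forward transform followed by the scaled inverse, both bound to one stream (sizes are illustrative):
cv::gpu::GpuMat src(480, 640, CV_32F), spect, back;
cv::gpu::Stream stream;
cv::gpu::dft(src, spect, src.size(), 0, stream);    // packed complex output
cv::gpu::dft(spect, back, src.size(),
             cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE, stream);
stream.waitForCompletion();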
//////////////////////////////////////////////////////////////////////////////
@ -1563,8 +1583,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size templ_size)
}
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
{
ConvolveBuf buf;
convolve(image, templ, result, ccorr, buf);
@ -1572,12 +1591,37 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
namespace cv { namespace gpu { namespace imgproc
{
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);
}}}
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
{
#ifndef HAVE_CUFFT
CV_Assert(image.type() == CV_32F);
CV_Assert(templ.type() == CV_32F);
CV_Assert(templ.cols <= 17 && templ.rows <= 17);
result.create(image.size(), CV_32F);
GpuMat& contKernel = buf.templ_block;
if (templ.isContinuous())
contKernel = templ;
else
{
contKernel = createContinuous(templ.size(), templ.type());
if (stream)
stream.enqueueCopy(templ, contKernel);
else
templ.copyTo(contKernel);
}
imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
#else
StaticAssert<sizeof(float) == sizeof(cufftReal)>::check();
StaticAssert<sizeof(float) * 2 == sizeof(cufftComplex)>::check();
@ -1587,77 +1631,91 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
if (templ.cols < 13 && templ.rows < 13)
{
result.create(image.size(), CV_32F);
GpuMat& contKernel = buf.templ_block;
if (templ.isContinuous())
contKernel = templ;
else
{
contKernel = createContinuous(templ.size(), templ.type());
if (stream)
stream.enqueueCopy(templ, contKernel);
else
templ.copyTo(contKernel);
}
imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
}
else
{
buf.create(image.size(), templ.size());
result.create(buf.result_size, CV_32F);
Size& block_size = buf.block_size;
Size& dft_size = buf.dft_size;
GpuMat& image_block = buf.image_block;
GpuMat& templ_block = buf.templ_block;
GpuMat& result_data = buf.result_data;
GpuMat& image_spect = buf.image_spect;
GpuMat& templ_spect = buf.templ_spect;
GpuMat& result_spect = buf.result_spect;
cufftHandle planR2C, planC2R;
cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
templ_spect.ptr<cufftComplex>()));
// Process all blocks of the result matrix
for (int y = 0; y < result.rows; y += block_size.height)
{
for (int x = 0; x < result.cols; x += block_size.width)
{
Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
std::min(y + dft_size.height, image.rows) - y);
GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
image.step);
copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
image_spect.ptr<cufftComplex>()));
mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
1.f / dft_size.area(), ccorr, stream);
cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
result_data.ptr<cufftReal>()));
Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
std::min(y + block_size.height, result.rows) - y);
GpuMat result_roi(result_roi_size, result.type(),
(void*)(result.ptr<float>(y) + x), result.step);
GpuMat result_block(result_roi_size, result_data.type(),
result_data.ptr(), result_data.step);
if (stream)
stream.enqueueCopy(result_block, result_roi);
else
result_block.copyTo(result_roi);
}
}
cufftSafeCall(cufftDestroy(planR2C));
cufftSafeCall(cufftDestroy(planC2R));
}
#endif
}
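A hedged usage sketch showing why ConvolveBuf is worth passing explicitly: repeated calls with the same image/template sizes reuse the FFT blocks instead of reallocating (sizes are illustrative):
cv::gpu::GpuMat d_image(512, 512, CV_32F), d_templ(64, 64, CV_32F), d_result;
cv::gpu::ConvolveBuf buf;                   // reused across calls with the same sizes
cv::gpu::Stream stream;
cv::gpu::convolve(d_image, d_templ, d_result, true /*ccorr*/, buf, stream);
stream.waitForCompletion();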
//////////////////////////////////////////////////////////////////////////////
@ -47,43 +47,32 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
#else
namespace cv { namespace gpu { namespace imgproc
{
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
const DevMem2D_<unsigned int> image_sum_g,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
DevMem2Df result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@ -92,8 +81,7 @@ namespace cv { namespace gpu { namespace imgproc
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
DevMem2Df result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@ -104,22 +92,21 @@ namespace cv { namespace gpu { namespace imgproc
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
DevMem2Df result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const DevMem2D_<unsigned int> image_sum,
const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned int templ_sqsum,
DevMem2Df result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
DevMem2Df result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@ -128,8 +115,7 @@ namespace cv { namespace gpu { namespace imgproc
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
DevMem2Df result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@ -140,12 +126,12 @@ namespace cv { namespace gpu { namespace imgproc
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
unsigned int templ_sum_a, unsigned int templ_sqsum_a,
DevMem2Df result, cudaStream_t stream);
void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
}}}
@ -186,103 +172,111 @@ namespace
}
void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F))
{
imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
GpuMat result_;
ConvolveBuf buf;
convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream);
imgproc::extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
GpuMat imagef, templf;
if (stream)
{
stream.enqueueConvert(image, imagef, CV_32F);
stream.enqueueConvert(templ, templf, CV_32F);
}
else
{
image.convertTo(imagef, CV_32F);
templ.convertTo(templf, CV_32F);
}
matchTemplate_CCORR_32F(imagef, templf, result, stream);
}
void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
matchTemplate_CCORR_8U(image, templ, result, stream);
GpuMat img_sqsum;
sqrIntegral(image.reshape(1), img_sqsum, stream);
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum,
result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
GpuMat img_sqsum;
sqrIntegral(image.reshape(1), img_sqsum, stream);
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, stream);
imgproc::matchTemplatePrepared_SQDIFF_8U(
templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
GpuMat img_sqsum;
sqrIntegral(image.reshape(1), img_sqsum, stream);
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, stream);
imgproc::matchTemplatePrepared_SQDIFF_NORMED_8U(
templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
matchTemplate_CCORR_8U(image, templ, result, stream);
if (image.channels() == 1)
{
GpuMat image_sum;
integral(image, image_sum, stream);
unsigned int templ_sum = (unsigned int)sum(templ)[0];
imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows,
image_sum, templ_sum, result, StreamAccessor::getStream(stream));
}
else
{
@ -291,7 +285,7 @@ namespace
split(image, images);
for (int i = 0; i < image.channels(); ++i)
integral(images[i], image_sums[i], stream);
Scalar templ_sum = sum(templ);
@ -301,19 +295,19 @@ namespace
imgproc::matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, image_sums[0], image_sums[1],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
imgproc::matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
imgproc::matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
(unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
@ -322,25 +316,34 @@ namespace
}
void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
GpuMat imagef, templf;
if (stream)
{
stream.enqueueConvert(image, imagef, CV_32F);
stream.enqueueConvert(templ, templf, CV_32F);
}
else
{
image.convertTo(imagef, CV_32F);
templ.convertTo(templf, CV_32F);
}
matchTemplate_CCORR_32F(imagef, templf, result, stream);
if (image.channels() == 1)
{
GpuMat image_sum, image_sqsum;
integral(image, image_sum, stream);
sqrIntegral(image, image_sqsum, stream);
unsigned int templ_sum = (unsigned int)sum(templ)[0];
unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0];
imgproc::matchTemplatePrepared_CCOFF_NORMED_8U(
templ.cols, templ.rows, image_sum, image_sqsum,
templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
}
else
{
@ -351,8 +354,8 @@ namespace
split(image, images);
for (int i = 0; i < image.channels(); ++i)
{
integral(images[i], image_sums[i], stream);
sqrIntegral(images[i], image_sqsums[i], stream);
}
Scalar templ_sum = sum(templ);
@ -367,7 +370,7 @@ namespace
image_sums[1], image_sqsums[1],
(unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC3(
@ -378,7 +381,7 @@ namespace
(unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC4(
@ -391,7 +394,7 @@ namespace
(unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2],
(unsigned int)templ_sum[3], (unsigned int)templ_sqsum[3],
result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
@ -401,12 +404,12 @@ namespace
}
void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
{
CV_Assert(image.type() == templ.type());
CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, Stream& stream);
static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
@ -424,7 +427,7 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& re
Caller caller = callers[method];
CV_Assert(caller);
caller(image, templ, result, stream);
}
#endif
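A hedged usage sketch of the asynchronous entry point (sizes are illustrative):
cv::gpu::GpuMat d_image(480, 640, CV_8UC1), d_templ(32, 32, CV_8UC1), d_result;
cv::gpu::Stream stream;
cv::gpu::matchTemplate(d_image, d_templ, d_result, CV_TM_CCORR_NORMED, stream);
stream.waitForCompletion();
cv::Mat h_result;
d_result.download(h_result);    // (rows - templ.rows + 1) x (cols - templ.cols + 1), CV_32F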
@ -59,10 +59,8 @@ namespace
NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
{
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
CV_Assert(gpuCounter.isInitialized());
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
return gpuCounter.maxSize();
}
@ -130,10 +128,8 @@ void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& f
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
CV_Assert(gpuAllocator.isInitialized());
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
}
void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv,
@ -189,7 +185,7 @@ void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, cons
state.ppBuffers[4] = bui.ptr<Ncv32f>();
state.ppBuffers[5] = bvi.ptr<Ncv32f>();
ncvSafeCall( nppiStInterpolateFrames(&state) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
@ -39,15 +39,16 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#if _MSC_VER >= 1200
#pragma warning( disable: 4251 4710 4711 4514 4996 )
#endif
#ifdef HAVE_CVCONFIG_H
#include "cvconfig.h"
#include "cvconfig.h"
#endif
#include <iostream>
@ -65,33 +66,43 @@
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/core/internal.hpp"
#define OPENCV_GPU_UNUSED(x) (void)x
#ifdef HAVE_CUDA
#include "internal_shared.hpp"
#include "cuda_runtime_api.h"
#include "cufft.h"
#include "npp.h"
#ifdef HAVE_CUFFT
#include "cufft.h"
#endif
#ifdef HAVE_CUBLAS
#include "cublas.h"
#endif
#include "internal_shared.hpp"
#include "opencv2/gpu/stream_accessor.hpp"
#include "npp.h"
#include "nvidia/core/NCV.hpp"
#include "nvidia/NPP_staging/NPP_staging.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
#include "nvidia/NCVBroxOpticalFlow.hpp"
#define CUDART_MINIMUM_REQUIRED_VERSION 4000
#define NPP_MINIMUM_REQUIRED_VERSION 4000
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
#error "Insufficient Cuda Runtime library version, please update it."
#endif
#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
#error "Insufficient NPP version, please update it."
#endif
#if defined(CUDA_ARCH_BIN_OR_PTX_10)
#error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
#endif
static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); }