@@ -78,11 +78,11 @@ __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
//////////////////////////////////////////////////////////////////////
// Naive_CCORR
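// Brute-force cross-correlation (CV_TM_CCORR): one thread per result element,
// accumulating sum_{i,j} image(y + i, x + j) * templ(i, j) over the template window.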
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
@@ -106,73 +106,49 @@ __global__ void matchTemplateNaiveKernel_CCORR(
}
}
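// Host-side launch pattern used throughout this file: a launcher templated on the
// element type and channel count configures a 32x8 thread block grid covering the
// result and enqueues the kernel on the caller's stream. When stream == 0 (the
// default stream) the launcher additionally calls cudaDeviceSynchronize(), so the
// old blocking behaviour is preserved; with a user stream the call returns
// asynchronously. The non-templated *_32F / *_8U entry points then select the
// right instantiation from a small table indexed by the channel count cn
// (slot 0 is a null placeholder, since cn runs from 1 to 4).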
template <typename T, int cn>
void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Naive_SQDIFF
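// Brute-force squared difference (CV_TM_SQDIFF): one thread per result element,
// accumulating sum_{i,j} (image(y + i, x + j) - templ(i, j))^2 over the template window.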
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
@@ -200,73 +176,48 @@ __global__ void matchTemplateNaiveKernel_SQDIFF(
}
}
template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF
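// "Prepared" SQDIFF: result is assumed to already hold the cross-correlation term,
// and the kernel (body elided here) combines it with the remaining terms of the
// expansion
//   sum (I - T)^2 = sum I^2 - 2 * sum(I * T) + sum T^2,
// reading each window's sum I^2 from the precomputed integral of squares
// (image_sqsum) and using the scalar templ_sqsum for sum T^2.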
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -281,37 +232,34 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_8U(
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF_NORMED
// normAcc* are accurate normalization routines which make the GPU matchTemplate
// results consistent with the CPU implementation
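// For CV_TM_SQDIFF_NORMED the prepared SQDIFF value is divided by
// sqrt(sum I^2 * sum T^2); normAcc_SQDIFF is expected to carry out that division
// with a guard against a vanishing denominator.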
@@ -337,9 +285,7 @@ __device__ float normAcc_SQDIFF(float num, float denum)
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -355,41 +301,37 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF
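// CCOEFF matching removes the template-mean contribution from an already computed
// cross-correlation: the host passes templ_sum_scale = templ_sum / (w * h) (the
// template mean) and the kernel (body elided here) subtracts templ_sum_scale times
// the window sum of the image, taken from the integral image image_sum, from the
// value stored in result.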
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -404,21 +346,20 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U(
}
}
void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
@@ -442,25 +383,27 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
}
}
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
const DevMem2D_<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
int w, int h,
float templ_sum_scale_r,
@@ -492,7 +435,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
}
}
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@@ -501,11 +443,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
@@ -513,10 +456,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
int w, int h,
float templ_sum_scale_r,
@@ -554,7 +499,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
}
}
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@@ -565,11 +509,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
@@ -579,9 +524,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF_NORMED
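// Normalized CCOEFF: besides the mean-removed numerator, the host precomputes
// weight = 1 / (w * h), templ_sum_scale = templ_sum * weight and
// templ_sqsum_scale = templ_sqsum - weight * templ_sum^2 (the template's centred
// sum of squares); the kernels combine these with the per-window image sums and
// sums of squares to build the normAcc denominator.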
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
@@ -607,12 +555,11 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const DevMem2D_<unsigned int> image_sum,
const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned int templ_sqsum,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -620,15 +567,18 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
float weight = 1.f / (w * h);
float templ_sum_scale = templ_sum * weight;
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
@@ -663,14 +613,13 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -680,7 +629,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g,
templ_sqsum_scale,
@@ -689,10 +639,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
@@ -736,7 +688,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@@ -745,7 +696,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -757,7 +708,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
templ_sqsum_scale,
@@ -767,10 +719,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
@@ -821,7 +775,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@@ -832,7 +785,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
unsigned int templ_sum_a, unsigned int templ_sqsum_a,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -846,7 +799,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
templ_sqsum_scale,
@@ -857,9 +811,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// normalize
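// Divides an existing CCORR result by sqrt(window sum of I^2 * templ_sqsum) via
// normAcc, yielding CV_TM_CCORR_NORMED; the kernel body is elided here and the
// host dispatcher below switches on the channel count.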
template <int cn>
__global__ void normalizeKernel_8U(
@@ -878,32 +835,36 @@ __global__ void normalizeKernel_8U(
}
}
void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
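// Copies channel 0 of a multi-channel 32F image into the single-channel float
// result, one thread per pixel (kernel body elided here).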
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
@@ -920,8 +881,7 @@ __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
}
}
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -929,23 +889,21 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn)
switch (cn)
{
case 1:
extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
break;
case 2:
extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
break;
case 3:
extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
break;
case 4:
extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}}}