From 40ee754e281e530dcdf9dbd64437051535aed4f8 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 19 Oct 2011 09:53:22 +0000 Subject: [PATCH] added WITH_CUFFT and WITH_CUBLAS flags to cmake scripts fixed gpu module error reporting added asynchronous version of some functions --- CMakeLists.txt | 13 + cvconfig.h.cmake | 6 + modules/core/include/opencv2/core/types_c.h | 4 +- modules/core/src/system.cpp | 5 +- modules/gpu/CMakeLists.txt | 11 +- modules/gpu/include/opencv2/gpu/gpu.hpp | 44 +-- modules/gpu/src/arithm.cpp | 4 +- modules/gpu/src/cascadeclassifier.cpp | 11 +- modules/gpu/src/cuda/imgproc.cu | 121 +++--- modules/gpu/src/cuda/match_template.cu | 388 +++++++++----------- modules/gpu/src/cuda/safe_call.hpp | 31 +- modules/gpu/src/error.cpp | 194 +++++----- modules/gpu/src/imgproc.cpp | 278 ++++++++------ modules/gpu/src/match_template.cpp | 177 ++++----- modules/gpu/src/optical_flow.cpp | 10 +- modules/gpu/src/precomp.hpp | 45 ++- 16 files changed, 712 insertions(+), 630 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 628c23f5f5..ff8f20c646 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -452,8 +452,12 @@ set(WITH_EIGEN ON CACHE BOOL "Include Eigen2/Eigen3 support") if( CMAKE_VERSION VERSION_GREATER "2.8") set(WITH_CUDA ON CACHE BOOL "Include NVidia Cuda Runtime support") + set(WITH_CUFFT ON CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support") + set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support") else() set(WITH_CUDA OFF CACHE BOOL "Include NVidia Cuda Runtime support") + set(WITH_CUFFT OFF CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support") + set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support") endif() set(WITH_OPENNI OFF CACHE BOOL "Include OpenNI support") @@ -995,6 +999,15 @@ if(WITH_CUDA) if(CUDA_FOUND) set(HAVE_CUDA 1) + + if(WITH_CUFFT) + set(HAVE_CUFFT 1) + endif() + + if(WITH_CUBLAS) + set(HAVE_CUBLAS 1) + endif() + message(STATUS "CUDA detected: " ${CUDA_VERSION}) set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") diff --git a/cvconfig.h.cmake b/cvconfig.h.cmake index 602c12d19f..36cf6bbb51 100644 --- a/cvconfig.h.cmake +++ b/cvconfig.h.cmake @@ -172,6 +172,12 @@ /* NVidia Cuda Runtime API*/ #cmakedefine HAVE_CUDA +/* NVidia Cuda Fast Fourier Transform (FFT) API*/ +#cmakedefine HAVE_CUFFT + +/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/ +#cmakedefine HAVE_CUBLAS + /* Compile for 'real' NVIDIA GPU architectures */ #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}" diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h index 11dbabfa7e..e2284a81d7 100644 --- a/modules/core/include/opencv2/core/types_c.h +++ b/modules/core/include/opencv2/core/types_c.h @@ -250,9 +250,7 @@ enum { CV_StsBadMemBlock= -214, /* an allocated block has been corrupted */ CV_StsAssert= -215, /* assertion failed */ CV_GpuNotSupported= -216, - CV_GpuApiCallError= -217, - CV_GpuNppCallError= -218, - CV_GpuCufftCallError= -219 + CV_GpuApiCallError= -217 }; /****************************************************************************************\ diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 2922cdaf8f..7c9b805f8e 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -629,9 +629,8 @@ CV_IMPL const char* 
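A minimal sketch of how code built against cvconfig.h can respect the new optional dependencies: the WITH_CUFFT / WITH_CUBLAS options above only set HAVE_CUFFT / HAVE_CUBLAS, so any CUFFT-backed path should be guarded and fall back to a CV_GpuNotSupported error when the flag is off. The function name below is illustrative, not part of the patch.

    #include "cvconfig.h"
    #include "opencv2/core/core.hpp"
    #ifdef HAVE_CUFFT
    #   include <cufft.h>
    #endif

    // hypothetical placeholder, not an OpenCV function
    void hypotheticalForwardDft()
    {
    #ifdef HAVE_CUFFT
        cufftHandle plan;   // CUFFT is available: create the plan and execute here
        (void)plan;
    #else
        CV_Error(CV_GpuNotSupported, "The library was compiled without CUFFT support");
    #endif
    }
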
cvErrorStr( int status ) case CV_StsNotImplemented : return "The function/feature is not implemented"; case CV_StsBadMemBlock : return "Memory block has been corrupted"; case CV_StsAssert : return "Assertion failed"; - case CV_GpuNotSupported : return "No GPU support"; - case CV_GpuApiCallError : return "Gpu Api call"; - case CV_GpuNppCallError : return "Npp Api call"; + case CV_GpuNotSupported : return "No GPU support"; + case CV_GpuApiCallError : return "Gpu Api call"; }; sprintf(buf, "Unknown %s code %d", status >= 0 ? "status":"error", status); diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 6c1a2225f8..74ccc320e3 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -120,12 +120,19 @@ set_target_properties(${the_target} PROPERTIES target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} ) if (HAVE_CUDA) - target_link_libraries(${the_target} ${CUDA_LIBRARIES}) - CUDA_ADD_CUFFT_TO_TARGET(${the_target}) + target_link_libraries(${the_target} ${CUDA_LIBRARIES}) unset(CUDA_npp_LIBRARY CACHE) find_cuda_helper_libs(npp) target_link_libraries(${the_target} ${CUDA_npp_LIBRARY}) + + if(HAVE_CUFFT) + CUDA_ADD_CUFFT_TO_TARGET(${the_target}) + endif() + + if(HAVE_CUBLAS) + CUDA_ADD_CUBLAS_TO_TARGET(${the_target}) + endif() endif() if(MSVC) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 1b7efb985e..ee9b734f1d 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -141,8 +141,8 @@ namespace cv //////////////////////////////// Error handling //////////////////////// - CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func); - CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func); + //CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func); + //CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func); //////////////////////////////// CudaMem //////////////////////////////// // CudaMem is limited cv::Mat with page locked memory allocation. @@ -628,11 +628,11 @@ namespace cv //! Does mean shift filtering on GPU. CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null()); //! Does mean shift procedure on GPU. CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null()); //! Does mean shift segmentation with elimination of small regions. CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, @@ -683,10 +683,12 @@ namespace cv //! rotate 8bit single or four channel image //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC //! 
supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); + CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, + int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); //! copies 2D array to a larger destination array and pads borders with user-specifiable constant - CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null()); + CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, + const Scalar& value = Scalar(), Stream& stream = Stream::Null()); //! computes the integral image //! sum will have CV_32S type, but will contain unsigned int values @@ -715,21 +717,26 @@ namespace cv CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null()); //! computes Harris cornerness criteria at each image pixel - CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101); - CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101); + CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, + int borderType = BORDER_REFLECT101); + CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, + int borderType = BORDER_REFLECT101); + CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, + int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null()); //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101); CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101); + CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, + int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null()); //! performs per-element multiplication of two full (not packed) Fourier spectrums //! supports 32FC2 matrixes only (interleaved format) - CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false); + CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null()); //! performs per-element multiplication of two full (not packed) Fourier spectrums //! supports 32FC2 matrixes only (interleaved format) - CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, - float scale, bool conjB=false); + CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null()); //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. //! 
Param dft_size is the size of DFT transform. @@ -742,19 +749,14 @@ namespace cv //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved. //! //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format. - CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0); + CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); //! computes convolution (or cross-correlation) of two images using discrete Fourier transform //! supports source images of 32FC1 type only //! result matrix will have 32FC1 type - CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr=false); - struct CV_EXPORTS ConvolveBuf; - - //! buffered version - CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr, ConvolveBuf& buf); + CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); + CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); struct CV_EXPORTS ConvolveBuf { @@ -766,7 +768,7 @@ namespace cv private: static Size estimateBlockSize(Size result_size, Size templ_size); - friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&); + friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream); Size result_size; Size block_size; @@ -778,7 +780,7 @@ namespace cv }; //! computes the proximity map for the raster template and the image where the template is searched for - CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method); + CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null()); //! 
smoothes the source image and downsamples it CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 67da283eb6..ec938fea59 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -93,7 +93,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiStTranspose_32u_C1R(const_cast(src.ptr()), static_cast(src.step), + ncvSafeCall( nppiStTranspose_32u_C1R(const_cast(src.ptr()), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); } else // if (src.elemSize() == 8) @@ -104,7 +104,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiStTranspose_64u_C1R(const_cast(src.ptr()), static_cast(src.step), + ncvSafeCall( nppiStTranspose_64u_C1R(const_cast(src.ptr()), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); } diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 984b33f820..6bef7fbe68 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -66,10 +66,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1) { ncvSetDebugOutputHandler(NCVDebugOutputHandler); - if (ncvStat != load(filename)) - { - CV_Error(CV_GpuApiCallError, "Error in GPU cacade load"); - } + ncvSafeCall( load(filename) ); } @@ -287,11 +284,7 @@ int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMa } unsigned int numDetections; - NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections); - if (ncvStat != NCV_SUCCESS) - { - CV_Error(CV_GpuApiCallError, "Error in face detectioln"); - } + ncvSafeCall( impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections) ); return numDetections; } diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 4d54895edc..8149945647 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -120,8 +120,7 @@ namespace cv { namespace gpu { namespace imgproc return make_short2((short)x0, (short)y0); } - extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows, - int sp, int sr, int maxIter, float eps ) + __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps ) { int x0 = blockIdx.x * blockDim.x + threadIdx.x; int y0 = blockIdx.y * blockDim.y + threadIdx.y; @@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace imgproc do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps); } - extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep, - unsigned char* outsp, size_t outspstep, - int cols, int rows, - int sp, int sr, int maxIter, float eps ) + __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, + unsigned char* outsp, size_t outspstep, + int cols, int rows, + int sp, int sr, int maxIter, float eps) { int x0 = blockIdx.x * blockDim.x + threadIdx.x; int y0 = blockIdx.y * blockDim.y + threadIdx.y; @@ -145,7 +144,7 @@ namespace cv { namespace gpu { namespace imgproc } } - 
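Hedged usage sketch for the asynchronous overloads declared above: passing a cv::gpu::Stream makes the call enqueue its work instead of blocking, and the caller synchronizes explicitly when the result is needed. The helper name and parameter values are assumptions for illustration, not taken from the patch.

    #include "opencv2/gpu/gpu.hpp"

    // illustrative helper, not from the patch
    void enqueueMeanShift(const cv::gpu::GpuMat& src)
    {
        cv::gpu::Stream stream;
        cv::gpu::GpuMat filtered;

        cv::gpu::meanShiftFiltering(src, filtered, 20, 20,
            cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1),
            stream);                    // queued on 'stream', returns without blocking

        stream.waitForCompletion();     // synchronize only when the result is needed
    }
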
extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps) + void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream) { dim3 grid(1, 1, 1); dim3 threads(32, 8, 1); @@ -155,13 +154,16 @@ namespace cv { namespace gpu { namespace imgproc cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); + meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + + //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } - extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps) + + void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) { dim3 grid(1, 1, 1); dim3 threads(32, 8, 1); @@ -171,11 +173,13 @@ namespace cv { namespace gpu { namespace imgproc cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); + meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + + //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } /////////////////////////////////// drawColorDisp /////////////////////////////////////////////// @@ -389,15 +393,16 @@ namespace cv { namespace gpu { namespace imgproc } } - void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst) + void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y)); - extractCovData_kernel<<>>(Dx.cols, Dx.rows, Dx, Dy, dst); + extractCovData_kernel<<>>(Dx.cols, Dx.rows, Dx, Dy, dst); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// @@ -475,7 +480,7 @@ namespace cv { namespace gpu { namespace imgproc } void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, - int border_type) + int border_type, cudaStream_t stream) { const int rows = Dx.rows; const int cols = Dx.cols; @@ -492,7 +497,7 @@ namespace cv { namespace gpu { namespace imgproc switch (border_type) { case BORDER_REFLECT101_GPU: - cornerHarris_kernel<<>>( + cornerHarris_kernel<<>>( cols, rows, block_size, k, dst, BrdRowReflect101(cols), BrdColReflect101(rows)); break; case BORDER_REPLICATE_GPU: @@ -500,16 +505,18 @@ namespace cv { namespace gpu { namespace 
imgproc harrisDxTex.addressMode[1] = cudaAddressModeClamp; harrisDyTex.addressMode[0] = cudaAddressModeClamp; harrisDyTex.addressMode[1] = cudaAddressModeClamp; - cornerHarris_kernel<<>>(cols, rows, block_size, k, dst); + + cornerHarris_kernel<<>>(cols, rows, block_size, k, dst); break; } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall(cudaUnbindTexture(harrisDxTex)); - cudaSafeCall(cudaUnbindTexture(harrisDyTex)); + //cudaSafeCall(cudaUnbindTexture(harrisDxTex)); + //cudaSafeCall(cudaUnbindTexture(harrisDyTex)); } /////////////////////////////////////////// Corner Min Eigen Val ///////////////////////////////////////////////// @@ -592,7 +599,7 @@ namespace cv { namespace gpu { namespace imgproc } void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, - int border_type) + int border_type, cudaStream_t stream) { const int rows = Dx.rows; const int cols = Dx.cols; @@ -609,7 +616,7 @@ namespace cv { namespace gpu { namespace imgproc switch (border_type) { case BORDER_REFLECT101_GPU: - cornerMinEigenVal_kernel<<>>( + cornerMinEigenVal_kernel<<>>( cols, rows, block_size, dst, BrdRowReflect101(cols), BrdColReflect101(rows)); break; case BORDER_REPLICATE_GPU: @@ -617,16 +624,18 @@ namespace cv { namespace gpu { namespace imgproc minEigenValDxTex.addressMode[1] = cudaAddressModeClamp; minEigenValDyTex.addressMode[0] = cudaAddressModeClamp; minEigenValDyTex.addressMode[1] = cudaAddressModeClamp; - cornerMinEigenVal_kernel<<>>(cols, rows, block_size, dst); + + cornerMinEigenVal_kernel<<>>(cols, rows, block_size, dst); break; } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaDeviceSynchronize()); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); - cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); - cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); + //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); + //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); } ////////////////////////////// Column Sum ////////////////////////////////////// @@ -667,8 +676,7 @@ namespace cv { namespace gpu { namespace imgproc ////////////////////////////////////////////////////////////////////////// // mulSpectrums - __global__ void mulSpectrumsKernel(const PtrStep a, const PtrStep b, - DevMem2D_ c) + __global__ void mulSpectrumsKernel(const PtrStep a, const PtrStep b, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -680,25 +688,23 @@ namespace cv { namespace gpu { namespace imgproc } - void mulSpectrums(const PtrStep a, const PtrStep b, - DevMem2D_ c) + void mulSpectrums(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulSpectrumsKernel<<>>(a, b, c); + mulSpectrumsKernel<<>>(a, b, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// // mulSpectrums_CONJ - __global__ void mulSpectrumsKernel_CONJ( - const PtrStep a, const PtrStep b, - DevMem2D_ c) + __global__ void mulSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -710,25 +716,23 @@ namespace cv { namespace gpu { namespace 
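The recurring change in the imgproc.cu hunks above and below is one launcher pattern: accept a cudaStream_t, launch on it, and synchronize only when the default stream (0) is used. A minimal sketch of that pattern follows; the kernel and caller names are illustrative, and it assumes the gpu module's internal helpers (divUp, cudaSafeCall, DevMem2Df) that the surrounding files already use.

    __global__ void scaleKernel(DevMem2Df src, PtrStepf dst, float factor)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < src.cols && y < src.rows)
            dst.ptr(y)[x] = src.ptr(y)[x] * factor;
    }

    void scaleCaller(const DevMem2Df& src, PtrStepf dst, float factor, cudaStream_t stream)
    {
        const dim3 threads(32, 8);
        const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y));

        // Work is queued on the caller-supplied stream; 0 means the default stream.
        scaleKernel<<<grid, threads, 0, stream>>>(src, dst, factor);
        cudaSafeCall( cudaGetLastError() );

        // Block only for the default stream; asynchronous callers synchronize later
        // through the cv::gpu::Stream that owns this cudaStream_t.
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
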
imgproc } - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, - DevMem2D_ c) + void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulSpectrumsKernel_CONJ<<>>(a, b, c); + mulSpectrumsKernel_CONJ<<>>(a, b, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// // mulAndScaleSpectrums - __global__ void mulAndScaleSpectrumsKernel( - const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + __global__ void mulAndScaleSpectrumsKernel(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -741,25 +745,23 @@ namespace cv { namespace gpu { namespace imgproc } - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulAndScaleSpectrumsKernel<<>>(a, b, scale, c); + mulAndScaleSpectrumsKernel<<>>(a, b, scale, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// // mulAndScaleSpectrums_CONJ - __global__ void mulAndScaleSpectrumsKernel_CONJ( - const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -772,16 +774,16 @@ namespace cv { namespace gpu { namespace imgproc } - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c) + void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream) { dim3 threads(256); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - mulAndScaleSpectrumsKernel_CONJ<<>>(a, b, scale, c); + mulAndScaleSpectrumsKernel_CONJ<<>>(a, b, scale, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// @@ -1015,17 +1017,18 @@ namespace cv { namespace gpu { namespace imgproc } } - void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel) + void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream) { cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); const dim3 block(16, 16); const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); - convolve<<>>(src, dst, kWidth, kHeight); + convolve<<>>(src, dst, kWidth, kHeight); cudaSafeCall(cudaGetLastError()); - cudaSafeCall(cudaDeviceSynchronize()); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); } diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index 242cf3c762..e954a26635 100644 --- a/modules/gpu/src/cuda/match_template.cu 
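The match_template.cu rewrite that follows replaces the per-channel switch (cn) blocks with templated launchers selected through small function-pointer tables, with the same stream handling as above. A hedged sketch of that dispatch shape, using illustrative names rather than the patch's own:

    template <typename T, int cn>
    void matchWorker(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
    {
        // templated kernel launch for T with cn channels, as in the launchers below ...
    }

    void matchDispatch(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2Db, const DevMem2Db, DevMem2Df, cudaStream_t);

        // Index 0 is unused so the table can be indexed directly by the channel count (1..4).
        static const caller_t callers[] =
        {
            0, matchWorker<unsigned char, 1>, matchWorker<unsigned char, 2>,
               matchWorker<unsigned char, 3>, matchWorker<unsigned char, 4>
        };

        callers[cn](image, templ, result, stream);
    }
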
+++ b/modules/gpu/src/cuda/match_template.cu @@ -78,11 +78,11 @@ __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +////////////////////////////////////////////////////////////////////// +// Naive_CCORR -template -__global__ void matchTemplateNaiveKernel_CCORR( - int w, int h, const PtrStepb image, const PtrStepb templ, - DevMem2Df result) +template +__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result) { typedef typename TypeVec::vec_type Type; typedef typename TypeVec::vec_type Typef; @@ -106,73 +106,49 @@ __global__ void matchTemplateNaiveKernel_CCORR( } } +template +void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream) +{ + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); -void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) + matchTemplateNaiveKernel_CCORR<<>>(templ.cols, templ.rows, image, templ, result); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + +void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); - switch (cn) + static const caller_t callers[] = { - case 1: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](image, templ, result, stream); } -void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) +void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); - switch (cn) + static const caller_t callers[] = { - case 1: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_CCORR<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplateNaive_CCORR, matchTemplateNaive_CCORR, 
matchTemplateNaive_CCORR, matchTemplateNaive_CCORR + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](image, templ, result, stream); } +////////////////////////////////////////////////////////////////////// +// Naive_SQDIFF template -__global__ void matchTemplateNaiveKernel_SQDIFF( - int w, int h, const PtrStepb image, const PtrStepb templ, - DevMem2Df result) +__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result) { typedef typename TypeVec::vec_type Type; typedef typename TypeVec::vec_type Typef; @@ -200,73 +176,48 @@ __global__ void matchTemplateNaiveKernel_SQDIFF( } } - -void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) +template +void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - switch (cn) - { - case 1: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } + matchTemplateNaiveKernel_SQDIFF<<>>(templ.cols, templ.rows, image, templ, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } +void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) +{ + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); + + static const caller_t callers[] = + { + 0, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF + }; + + callers[cn](image, templ, result, stream); +} -void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, - DevMem2Df result, int cn) +void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); - switch (cn) + static const caller_t callers[] = { - case 1: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 2: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 3: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - case 4: - matchTemplateNaiveKernel_SQDIFF<<>>( - templ.cols, templ.rows, image, templ, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF, matchTemplateNaive_SQDIFF + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](image, templ, result, stream); } +////////////////////////////////////////////////////////////////////// +// Prepared_SQDIFF template -__global__ void 
matchTemplatePreparedKernel_SQDIFF_8U( - int w, int h, const PtrStep image_sqsum, - unsigned int templ_sqsum, DevMem2Df result) +__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep image_sqsum, unsigned int templ_sqsum, DevMem2Df result) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -281,37 +232,34 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_8U( } } +template +void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream) +{ + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + + matchTemplatePreparedKernel_SQDIFF_8U<<>>(w, h, image_sqsum, templ_sqsum, result); + cudaSafeCall( cudaGetLastError() ); -void matchTemplatePrepared_SQDIFF_8U( - int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn) + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + +void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, + cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - switch (cn) + typedef void (*caller_t)(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream); + + static const caller_t callers[] = { - case 1: - matchTemplatePreparedKernel_SQDIFF_8U<1><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 2: - matchTemplatePreparedKernel_SQDIFF_8U<2><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 3: - matchTemplatePreparedKernel_SQDIFF_8U<3><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 4: - matchTemplatePreparedKernel_SQDIFF_8U<4><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4> + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](w, h, image_sqsum, templ_sqsum, result, stream); } +////////////////////////////////////////////////////////////////////// +// Prepared_SQDIFF_NORMED // normAcc* are accurate normalization routines which make GPU matchTemplate // consistent with CPU one @@ -337,9 +285,7 @@ __device__ float normAcc_SQDIFF(float num, float denum) template -__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U( - int w, int h, const PtrStep image_sqsum, - unsigned int templ_sqsum, DevMem2Df result) +__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep image_sqsum, unsigned int templ_sqsum, DevMem2Df result) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -355,41 +301,37 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U( } } +template +void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, + DevMem2Df result, cudaStream_t stream) +{ + const dim3 threads(32, 8); + const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + + matchTemplatePreparedKernel_SQDIFF_NORMED_8U<<>>(w, h, image_sqsum, templ_sqsum, result); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + -void matchTemplatePrepared_SQDIFF_NORMED_8U( - 
int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn) +void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, + DevMem2Df result, int cn, cudaStream_t stream) { - dim3 threads(32, 8); - dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - switch (cn) + typedef void (*caller_t)(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream); + static const caller_t callers[] = { - case 1: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<1><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 2: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<2><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 3: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<3><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - case 4: - matchTemplatePreparedKernel_SQDIFF_NORMED_8U<4><<>>( - w, h, image_sqsum, templ_sqsum, result); - break; - } - cudaSafeCall( cudaGetLastError() ); + 0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4> + }; - cudaSafeCall( cudaDeviceSynchronize() ); + callers[cn](w, h, image_sqsum, templ_sqsum, result, stream); } +////////////////////////////////////////////////////////////////////// +// Prepared_CCOFF -__global__ void matchTemplatePreparedKernel_CCOFF_8U( - int w, int h, float templ_sum_scale, - const PtrStep image_sum, DevMem2Df result) +__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep image_sum, DevMem2Df result) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -404,21 +346,20 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U( } } - -void matchTemplatePrepared_CCOFF_8U( - int w, int h, const DevMem2D_ image_sum, - unsigned int templ_sum, DevMem2Df result) +void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_ image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8U<<>>( - w, h, (float)templ_sum / (w * h), image_sum, result); + + matchTemplatePreparedKernel_CCOFF_8U<<>>(w, h, (float)templ_sum / (w * h), image_sum, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_8UC2( int w, int h, float templ_sum_scale_r, float templ_sum_scale_g, const PtrStep image_sum_r, @@ -442,25 +383,27 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC2( } } - void matchTemplatePrepared_CCOFF_8UC2( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sum_g, unsigned int templ_sum_r, unsigned int templ_sum_g, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8UC2<<>>( + + matchTemplatePreparedKernel_CCOFF_8UC2<<>>( w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h), image_sum_r, image_sum_g, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_8UC3( 
int w, int h, float templ_sum_scale_r, @@ -492,7 +435,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3( } } - void matchTemplatePrepared_CCOFF_8UC3( int w, int h, const DevMem2D_ image_sum_r, @@ -501,11 +443,12 @@ void matchTemplatePrepared_CCOFF_8UC3( unsigned int templ_sum_r, unsigned int templ_sum_g, unsigned int templ_sum_b, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8UC3<<>>( + + matchTemplatePreparedKernel_CCOFF_8UC3<<>>( w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h), @@ -513,10 +456,12 @@ void matchTemplatePrepared_CCOFF_8UC3( image_sum_r, image_sum_g, image_sum_b, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_8UC4( int w, int h, float templ_sum_scale_r, @@ -554,7 +499,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC4( } } - void matchTemplatePrepared_CCOFF_8UC4( int w, int h, const DevMem2D_ image_sum_r, @@ -565,11 +509,12 @@ void matchTemplatePrepared_CCOFF_8UC4( unsigned int templ_sum_g, unsigned int templ_sum_b, unsigned int templ_sum_a, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); - matchTemplatePreparedKernel_CCOFF_8UC4<<>>( + + matchTemplatePreparedKernel_CCOFF_8UC4<<>>( w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h), @@ -579,9 +524,12 @@ void matchTemplatePrepared_CCOFF_8UC4( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } +////////////////////////////////////////////////////////////////////// +// Prepared_CCOFF_NORMED __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U( int w, int h, float weight, @@ -607,12 +555,11 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U( } } - void matchTemplatePrepared_CCOFF_NORMED_8U( int w, int h, const DevMem2D_ image_sum, const DevMem2D_ image_sqsum, unsigned int templ_sum, unsigned int templ_sqsum, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -620,15 +567,18 @@ void matchTemplatePrepared_CCOFF_NORMED_8U( float weight = 1.f / (w * h); float templ_sum_scale = templ_sum * weight; float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum; - matchTemplatePreparedKernel_CCOFF_NORMED_8U<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8U<<>>( w, h, weight, templ_sum_scale, templ_sqsum_scale, image_sum, image_sqsum, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2( int w, int h, float weight, float templ_sum_scale_r, float templ_sum_scale_g, @@ -663,14 +613,13 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2( } } - void matchTemplatePrepared_CCOFF_NORMED_8UC2( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, - DevMem2Df result) + DevMem2Df result, cudaStream_t 
stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -680,7 +629,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2( float templ_sum_scale_g = templ_sum_g * weight; float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r + templ_sqsum_g - weight * templ_sum_g * templ_sum_g; - matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<>>( w, h, weight, templ_sum_scale_r, templ_sum_scale_g, templ_sqsum_scale, @@ -689,10 +639,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3( int w, int h, float weight, float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, @@ -736,7 +688,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3( } } - void matchTemplatePrepared_CCOFF_NORMED_8UC3( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -745,7 +696,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3( unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -757,7 +708,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3( float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r + templ_sqsum_g - weight * templ_sum_g * templ_sum_g + templ_sqsum_b - weight * templ_sum_b * templ_sum_b; - matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<>>( w, h, weight, templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sqsum_scale, @@ -767,10 +719,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } + __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4( int w, int h, float weight, float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, @@ -821,7 +775,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4( } } - void matchTemplatePrepared_CCOFF_NORMED_8UC4( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -832,7 +785,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4( unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, unsigned int templ_sum_a, unsigned int templ_sqsum_a, - DevMem2Df result) + DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -846,7 +799,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4( + templ_sqsum_g - weight * templ_sum_g * templ_sum_g + templ_sqsum_b - weight * templ_sum_b * templ_sum_b + templ_sqsum_a - weight * templ_sum_a * templ_sum_a; - matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<>>( + + matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<>>( w, h, weight, templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a, templ_sqsum_scale, @@ -857,9 +811,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 
0) + cudaSafeCall( cudaDeviceSynchronize() ); } +////////////////////////////////////////////////////////////////////// +// normalize template __global__ void normalizeKernel_8U( @@ -878,32 +835,36 @@ __global__ void normalizeKernel_8U( } } - void normalize_8U(int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn) + unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); + switch (cn) { case 1: - normalizeKernel_8U<1><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<1><<>>(w, h, image_sqsum, templ_sqsum, result); break; case 2: - normalizeKernel_8U<2><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<2><<>>(w, h, image_sqsum, templ_sqsum, result); break; case 3: - normalizeKernel_8U<3><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<3><<>>(w, h, image_sqsum, templ_sqsum, result); break; case 4: - normalizeKernel_8U<4><<>>(w, h, image_sqsum, templ_sqsum, result); + normalizeKernel_8U<4><<>>(w, h, image_sqsum, templ_sqsum, result); break; } + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } +////////////////////////////////////////////////////////////////////// +// extractFirstChannel template __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result) @@ -920,8 +881,7 @@ __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result) } } - -void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn) +void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); @@ -929,23 +889,21 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn) switch (cn) { case 1: - extractFirstChannel_32F<1><<>>(image, result); + extractFirstChannel_32F<1><<>>(image, result); break; case 2: - extractFirstChannel_32F<2><<>>(image, result); + extractFirstChannel_32F<2><<>>(image, result); break; case 3: - extractFirstChannel_32F<3><<>>(image, result); + extractFirstChannel_32F<3><<>>(image, result); break; case 4: - extractFirstChannel_32F<4><<>>(image, result); + extractFirstChannel_32F<4><<>>(image, result); break; } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - - }}} - diff --git a/modules/gpu/src/cuda/safe_call.hpp b/modules/gpu/src/cuda/safe_call.hpp index e3e00beaaf..6e0c219e19 100644 --- a/modules/gpu/src/cuda/safe_call.hpp +++ b/modules/gpu/src/cuda/safe_call.hpp @@ -45,16 +45,18 @@ #include "cuda_runtime_api.h" #include "cufft.h" -//#include +#include "NCV.hpp" #if defined(__GNUC__) #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) - #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__) #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__) + #define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__, __func__) + #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__) #else /* defined(__CUDACC__) || defined(__MSVC__) */ #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) - #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__) #define nppSafeCall(expr) 
___nppSafeCall(expr, __FILE__, __LINE__) + #define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__) + #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__) #endif namespace cv @@ -62,8 +64,9 @@ namespace cv namespace gpu { void error(const char *error_string, const char *file, const int line, const char *func = ""); - void nppError(int err, const char *file, const int line, const char *func = ""); - void cufftError(int err, const char *file, const int line, const char *func = ""); + void nppError(int err, const char *file, const int line, const char *func = ""); + void ncvError(int err, const char *file, const int line, const char *func = ""); + void cufftError(int err, const char *file, const int line, const char *func = ""); static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") { @@ -71,17 +74,23 @@ namespace cv cv::gpu::error(cudaGetErrorString(err), file, line, func); } - static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "") - { - if (CUFFT_SUCCESS != err) - cv::gpu::cufftError(err, file, line, func); - } - static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") { if (err < 0) cv::gpu::nppError(err, file, line, func); } + + static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "") + { + if (NCV_SUCCESS != err) + cv::gpu::ncvError(err, file, line, func); + } + + static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "") + { + if (CUFFT_SUCCESS != err) + cv::gpu::cufftError(err, file, line, func); + } } } diff --git a/modules/gpu/src/error.cpp b/modules/gpu/src/error.cpp index 1f38f6174c..e5090dbc3b 100644 --- a/modules/gpu/src/error.cpp +++ b/modules/gpu/src/error.cpp @@ -42,30 +42,45 @@ #include "precomp.hpp" - using namespace cv; using namespace cv::gpu; +using namespace std; - -#if !defined (HAVE_CUDA) - -#else /* !defined (HAVE_CUDA) */ - +#ifdef HAVE_CUDA namespace { #define error_entry(entry) { entry, #entry } - ////////////////////////////////////////////////////////////////////////// - // NPP errors - - struct NppError + struct ErrorEntry { - int error; + int code; string str; - } + }; + + struct ErrorEntryComparer + { + int code; + ErrorEntryComparer(int code_) : code(code_) {}; + bool operator()(const ErrorEntry& e) const { return e.code == code; } + }; + + string getErrorString(int code, const ErrorEntry* errors, size_t n) + { + size_t idx = find_if(errors, errors + n, ErrorEntryComparer(code)) - errors; + + const string& msg = (idx != n) ? 
errors[idx].str : string("Unknown error code"); + + ostringstream ostr; + ostr << msg << " [Code = " << code << "]"; + + return ostr.str(); + } + + ////////////////////////////////////////////////////////////////////////// + // NPP errors - npp_errors [] = + const ErrorEntry npp_errors [] = { error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ), error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ), @@ -74,6 +89,7 @@ namespace #if defined (_MSC_VER) error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ), #endif + error_entry( NPP_BAD_ARG_ERROR ), error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ), error_entry( NPP_TEXTURE_BIND_ERROR ), @@ -110,106 +126,116 @@ namespace error_entry( NPP_ODD_ROI_WARNING ) }; - const size_t error_num = sizeof(npp_errors) / sizeof(npp_errors[0]); + const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]); - struct Searcher + ////////////////////////////////////////////////////////////////////////// + // NCV errors + + const ErrorEntry ncv_errors [] = { - int err; - Searcher(int err_) : err(err_) {}; - bool operator()(const NppError& e) const { return e.error == err; } + error_entry( NCV_SUCCESS ), + error_entry( NCV_UNKNOWN_ERROR ), + error_entry( NCV_CUDA_ERROR ), + error_entry( NCV_NPP_ERROR ), + error_entry( NCV_FILE_ERROR ), + error_entry( NCV_NULL_PTR ), + error_entry( NCV_INCONSISTENT_INPUT ), + error_entry( NCV_TEXTURE_BIND_ERROR ), + error_entry( NCV_DIMENSIONS_INVALID ), + error_entry( NCV_INVALID_ROI ), + error_entry( NCV_INVALID_STEP ), + error_entry( NCV_INVALID_SCALE ), + error_entry( NCV_INVALID_SCALE ), + error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ), + error_entry( NCV_ALLOCATOR_BAD_ALLOC ), + error_entry( NCV_ALLOCATOR_BAD_DEALLOC ), + error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ), + error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ), + error_entry( NCV_ALLOCATOR_BAD_REUSE ), + error_entry( NCV_MEM_COPY_ERROR ), + error_entry( NCV_MEM_RESIDENCE_ERROR ), + error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ), + error_entry( NCV_HAAR_INVALID_PIXEL_STEP ), + error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ), + error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ), + error_entry( NCV_HAAR_TOO_LARGE_FEATURES ), + error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ), + error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ), + error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ), + error_entry( NPPST_SUCCESS ), + error_entry( NPPST_ERROR ), + error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ), + error_entry( NPPST_NULL_POINTER_ERROR ), + error_entry( NPPST_TEXTURE_BIND_ERROR ), + error_entry( NPPST_MEMCPY_ERROR ), + error_entry( NPPST_MEM_ALLOC_ERR ), + error_entry( NPPST_MEMFREE_ERR ), + error_entry( NPPST_INVALID_ROI ), + error_entry( NPPST_INVALID_STEP ), + error_entry( NPPST_INVALID_SCALE ), + error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ), + error_entry( NPPST_MEM_RESIDENCE_ERROR ), + error_entry( NPPST_MEM_INTERNAL_ERROR ) }; + const size_t ncv_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]); + ////////////////////////////////////////////////////////////////////////// // CUFFT errors - struct CufftError - { - int code; - string message; - }; - - const CufftError cufft_errors[] = - { - error_entry(CUFFT_INVALID_PLAN), - error_entry(CUFFT_ALLOC_FAILED), - error_entry(CUFFT_INVALID_TYPE), - error_entry(CUFFT_INVALID_VALUE), - error_entry(CUFFT_INTERNAL_ERROR), - error_entry(CUFFT_EXEC_FAILED), - error_entry(CUFFT_SETUP_FAILED), - error_entry(CUFFT_INVALID_SIZE), - error_entry(CUFFT_UNALIGNED_DATA) - }; - - struct CufftErrorComparer + const ErrorEntry cufft_errors[] 
= { - CufftErrorComparer(int code_): code(code_) {} - bool operator()(const CufftError& other) const - { - return other.code == code; - } - int code; + error_entry( CUFFT_INVALID_PLAN ), + error_entry( CUFFT_ALLOC_FAILED ), + error_entry( CUFFT_INVALID_TYPE ), + error_entry( CUFFT_INVALID_VALUE ), + error_entry( CUFFT_INTERNAL_ERROR ), + error_entry( CUFFT_EXEC_FAILED ), + error_entry( CUFFT_SETUP_FAILED ), + error_entry( CUFFT_INVALID_SIZE ), + error_entry( CUFFT_UNALIGNED_DATA ) }; const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]); - } namespace cv { namespace gpu { - const string getNppErrorString( int err ) - { - size_t idx = std::find_if(npp_errors, npp_errors + error_num, Searcher(err)) - npp_errors; - const string& msg = (idx != error_num) ? npp_errors[idx].str : string("Unknown error code"); - - std::stringstream interpreter; - interpreter << msg <<" [Code = " << err << "]"; - - return interpreter.str(); - } - - void nppError( int err, const char *file, const int line, const char *func) - { - cv::error( cv::Exception(CV_GpuNppCallError, getNppErrorString(err), func, file, line) ); - } - - const string getCufftErrorString(int err_code) - { - const CufftError* cufft_error = std::find_if( - cufft_errors, cufft_errors + cufft_error_num, - CufftErrorComparer(err_code)); - - bool found = cufft_error != cufft_errors + cufft_error_num; - - std::stringstream ss; - ss << (found ? cufft_error->message : "Unknown error code"); - ss << " [Code = " << err_code << "]"; - - return ss.str(); - } - - void cufftError(int err, const char *file, const int line, const char *func) - { - cv::error(cv::Exception(CV_GpuCufftCallError, getCufftErrorString(err), func, file, line)); - } - void error(const char *error_string, const char *file, const int line, const char *func) { int code = CV_GpuApiCallError; - if (std::uncaught_exception()) + if (uncaught_exception()) { const char* errorStr = cvErrorStr(code); const char* function = func ? 
func : "unknown function"; - std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line; - std::cerr.flush(); + cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line; + cerr.flush(); } else cv::error( cv::Exception(code, error_string, func, file, line) ); } + + void nppError(int code, const char *file, const int line, const char *func) + { + string msg = getErrorString(code, npp_errors, npp_error_num); + cv::gpu::error(msg.c_str(), file, line, func); + } + + void ncvError(int code, const char *file, const int line, const char *func) + { + string msg = getErrorString(code, ncv_errors, ncv_error_num); + cv::gpu::error(msg.c_str(), file, line, func); + } + + void cufftError(int code, const char *file, const int line, const char *func) + { + string msg = getErrorString(code, cufft_errors, cufft_error_num); + cv::gpu::error(msg.c_str(), file, line, func); + } } } diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 0aa88e81b5..528ac65794 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -48,8 +48,8 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, const Scalar&, Stream&){ throw_nogpu(); } -void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); } -void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); } +void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); } +void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); } void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); } void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); } @@ -82,14 +82,16 @@ void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nog void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } +void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_nogpu(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); } -void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); } -void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool) { throw_nogpu(); } -void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); } +void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_nogpu(); } +void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_nogpu(); } +void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); } void 
cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); } void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); } +void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream) { throw_nogpu(); } void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); } @@ -151,10 +153,10 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp namespace cv { namespace gpu { namespace imgproc { - extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps); + void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream); }}} -void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria) +void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream) { if( src.empty() ) CV_Error( CV_StsBadArg, "The input image is empty" ); @@ -174,7 +176,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, eps = 1.f; eps = (float)std::max(criteria.epsilon, 0.0); - imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps); + imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -182,10 +184,10 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, namespace cv { namespace gpu { namespace imgproc { - extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps); + void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream); }}} -void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria) +void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream) { if( src.empty() ) CV_Error( CV_StsBadArg, "The input image is empty" ); @@ -206,7 +208,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int eps = 1.f; eps = (float)std::max(criteria.epsilon, 0.0); - imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps); + imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -766,14 +768,14 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) ); Ncv32u bufSize; - nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); + ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer); cudaStream_t stream = StreamAccessor::getStream(s); NppStStreamHandler h(stream); - nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), static_cast(src.step), + ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), static_cast(src.step), sum.ptr(), static_cast(sum.step), 
roiSize, buffer.ptr(), bufSize, prop) ); if (stream == 0) @@ -819,7 +821,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s) cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) ); Ncv32u bufSize; - nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop)); + ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop)); GpuMat buf(1, bufSize, CV_8U); cudaStream_t stream = StreamAccessor::getStream(s); @@ -827,7 +829,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s) NppStStreamHandler h(stream); sqsum.create(src.rows + 1, src.cols + 1, CV_64F); - nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast(src.ptr(0)), static_cast(src.step), + ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast(src.ptr(0)), static_cast(src.step), sqsum.ptr(0), static_cast(sqsum.step), roiSize, buf.ptr(0), bufSize, prop)); if (stream == 0) @@ -1260,16 +1262,16 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& namespace cv { namespace gpu { namespace imgproc { - void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst); - void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type); - void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type); + void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream); + void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); + void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); }}} namespace { template - void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType) + void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) { double scale = (double)(1 << ((ksize > 0 ? 
ksize : 3) - 1)) * blockSize; if (ksize < 0) @@ -1283,25 +1285,25 @@ namespace if (ksize > 0) { - Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, borderType); - Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, borderType); + Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream); + Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream); } else { - Scharr(src, Dx, CV_32F, 1, 0, scale, borderType); - Scharr(src, Dy, CV_32F, 0, 1, scale, borderType); + Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream); + Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream); } } - void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType) + void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) { switch (src.type()) { case CV_8U: - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); break; case CV_32F: - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); break; default: CV_Error(CV_StsBadArg, "extractCovData: unsupported type of the source matrix"); @@ -1343,6 +1345,12 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ks } void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType) +{ + GpuMat buf; + cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType); +} + +void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream) { CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE); @@ -1350,9 +1358,9 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); dst.create(src.size(), CV_32F); - imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType); + imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); } void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType) @@ -1362,6 +1370,12 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, i } void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType) +{ + GpuMat buf; + cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType); +} + +void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) { CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE); @@ -1369,9 +1383,9 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - extractCovData(src, Dx, Dy, blockSize, ksize, borderType); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); dst.create(src.size(), CV_32F); - imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType); + 
imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// @@ -1379,21 +1393,16 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM namespace cv { namespace gpu { namespace imgproc { - void mulSpectrums(const PtrStep a, const PtrStep b, - DevMem2D_ c); + void mulSpectrums(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream); - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, - DevMem2D_ c); + void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, DevMem2D_ c, cudaStream_t stream); }}} -void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, - int flags, bool conjB) +void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) { - typedef void (*Caller)(const PtrStep, const PtrStep, - DevMem2D_); - static Caller callers[] = { imgproc::mulSpectrums, - imgproc::mulSpectrums_CONJ }; + typedef void (*Caller)(const PtrStep, const PtrStep, DevMem2D_, cudaStream_t stream); + static Caller callers[] = { imgproc::mulSpectrums, imgproc::mulSpectrums_CONJ }; CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); CV_Assert(a.size() == b.size()); @@ -1401,7 +1410,7 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, c.create(a.size(), CV_32FC2); Caller caller = callers[(int)conjB]; - caller(a, b, c); + caller(a, b, c, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// @@ -1409,21 +1418,16 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, namespace cv { namespace gpu { namespace imgproc { - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c); + void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream); - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, - float scale, DevMem2D_ c); + void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, DevMem2D_ c, cudaStream_t stream); }}} -void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, - int flags, float scale, bool conjB) +void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) { - typedef void (*Caller)(const PtrStep, const PtrStep, - float scale, DevMem2D_); - static Caller callers[] = { imgproc::mulAndScaleSpectrums, - imgproc::mulAndScaleSpectrums_CONJ }; + typedef void (*Caller)(const PtrStep, const PtrStep, float scale, DevMem2D_, cudaStream_t stream); + static Caller callers[] = { imgproc::mulAndScaleSpectrums, imgproc::mulAndScaleSpectrums_CONJ }; CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); CV_Assert(a.size() == b.size()); @@ -1431,14 +1435,26 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, c.create(a.size(), CV_32FC2); Caller caller = callers[(int)conjB]; - caller(a, b, scale, c); + caller(a, b, scale, c, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// // dft -void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags) +void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream) { +#ifndef HAVE_CUFFT + + OPENCV_GPU_UNUSED(src); + OPENCV_GPU_UNUSED(dst); + OPENCV_GPU_UNUSED(dft_size); + 
OPENCV_GPU_UNUSED(flags); + OPENCV_GPU_UNUSED(stream); + + throw_nogpu(); + +#else + CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2); // We don't support unpacked output (in the case of real input) @@ -1483,6 +1499,8 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags) else cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type); + cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); + if (is_complex_input) { if (is_complex_output) @@ -1514,7 +1532,9 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags) cufftSafeCall(cufftDestroy(plan)); if (is_scaled_dft) - multiply(dst, Scalar::all(1. / dft_size.area()), dst); + multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream); + +#endif } ////////////////////////////////////////////////////////////////////////////// @@ -1563,8 +1583,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size templ_size) } -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr) +void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr) { ConvolveBuf buf; convolve(image, templ, result, ccorr, buf); @@ -1572,12 +1591,37 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, namespace cv { namespace gpu { namespace imgproc { - void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel); + void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream); }}} -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, - bool ccorr, ConvolveBuf& buf) +void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream) { +#ifndef HAVE_CUFFT + + CV_Assert(image.type() == CV_32F); + CV_Assert(templ.type() == CV_32F); + CV_Assert(templ.cols <= 17 && templ.rows <= 17); + + result.create(image.size(), CV_32F); + + GpuMat& contKernel = buf.templ_block; + + if (templ.isContinuous()) + contKernel = templ; + else + { + contKernel = createContinuous(templ.size(), templ.type()); + + if (stream) + stream.enqueueCopy(templ, contKernel); + else + templ.copyTo(contKernel); + } + + imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr(), StreamAccessor::getStream(stream)); + +#else + StaticAssert::check(); StaticAssert::check(); @@ -1587,77 +1631,91 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, if (templ.cols < 13 && templ.rows < 13) { result.create(image.size(), CV_32F); - GpuMat contKernel; + + GpuMat& contKernel = buf.templ_block; if (templ.isContinuous()) contKernel = templ; else { contKernel = createContinuous(templ.size(), templ.type()); - templ.copyTo(contKernel); - } - imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr()); + if (stream) + stream.enqueueCopy(templ, contKernel); + else + templ.copyTo(contKernel); + } - return; + imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr(), StreamAccessor::getStream(stream)); } + else + { + buf.create(image.size(), templ.size()); + result.create(buf.result_size, CV_32F); - buf.create(image.size(), templ.size()); - result.create(buf.result_size, CV_32F); + Size& block_size = buf.block_size; + Size& dft_size = buf.dft_size; - Size& block_size = buf.block_size; - Size& dft_size = buf.dft_size; + GpuMat& image_block = buf.image_block; + 
GpuMat& templ_block = buf.templ_block; + GpuMat& result_data = buf.result_data; - GpuMat& image_block = buf.image_block; - GpuMat& templ_block = buf.templ_block; - GpuMat& result_data = buf.result_data; + GpuMat& image_spect = buf.image_spect; + GpuMat& templ_spect = buf.templ_spect; + GpuMat& result_spect = buf.result_spect; - GpuMat& image_spect = buf.image_spect; - GpuMat& templ_spect = buf.templ_spect; - GpuMat& result_spect = buf.result_spect; + cufftHandle planR2C, planC2R; + cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); + cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); - cufftHandle planR2C, planC2R; - cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); - cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); + cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) ); + cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) ); - GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); - copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, - templ_block.cols - templ_roi.cols, 0); + GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); + copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + templ_block.cols - templ_roi.cols, 0, Scalar(), stream); - cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), - templ_spect.ptr())); + cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), + templ_spect.ptr())); - // Process all blocks of the result matrix - for (int y = 0; y < result.rows; y += block_size.height) - { - for (int x = 0; x < result.cols; x += block_size.width) + // Process all blocks of the result matrix + for (int y = 0; y < result.rows; y += block_size.height) { - Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, - std::min(y + dft_size.height, image.rows) - y); - GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), - image.step); - copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, - 0, image_block.cols - image_roi.cols, 0); - - cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), - image_spect.ptr())); - mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, - 1.f / dft_size.area(), ccorr); - cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), - result_data.ptr())); - - Size result_roi_size(std::min(x + block_size.width, result.cols) - x, - std::min(y + block_size.height, result.rows) - y); - GpuMat result_roi(result_roi_size, result.type(), - (void*)(result.ptr(y) + x), result.step); - GpuMat result_block(result_roi_size, result_data.type(), - result_data.ptr(), result_data.step); - result_block.copyTo(result_roi); + for (int x = 0; x < result.cols; x += block_size.width) + { + Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, + std::min(y + dft_size.height, image.rows) - y); + GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), + image.step); + copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, + 0, image_block.cols - image_roi.cols, 0, Scalar(), stream); + + cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), + image_spect.ptr())); + mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, + 1.f / dft_size.area(), ccorr, stream); + cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), + result_data.ptr())); + + Size result_roi_size(std::min(x + block_size.width, result.cols) - x, + std::min(y + 
block_size.height, result.rows) - y); + GpuMat result_roi(result_roi_size, result.type(), + (void*)(result.ptr(y) + x), result.step); + GpuMat result_block(result_roi_size, result_data.type(), + result_data.ptr(), result_data.step); + + if (stream) + stream.enqueueCopy(result_block, result_roi); + else + result_block.copyTo(result_roi); + } } + + cufftSafeCall(cufftDestroy(planR2C)); + cufftSafeCall(cufftDestroy(planC2R)); } - cufftSafeCall(cufftDestroy(planR2C)); - cufftSafeCall(cufftDestroy(planC2R)); +#endif } ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp index 58bc8bb911..e74d0fdc55 100644 --- a/modules/gpu/src/match_template.cpp +++ b/modules/gpu/src/match_template.cpp @@ -47,43 +47,32 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) -void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); } +void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } #else namespace cv { namespace gpu { namespace imgproc { - void matchTemplateNaive_CCORR_8U( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); + void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); + void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); - void matchTemplateNaive_CCORR_32F( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); + void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); + void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); - void matchTemplateNaive_SQDIFF_8U( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); + void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, + int cn, cudaStream_t stream); - void matchTemplateNaive_SQDIFF_32F( - const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn); - - void matchTemplatePrepared_SQDIFF_8U( - int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn); - - void matchTemplatePrepared_SQDIFF_NORMED_8U( - int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn); - - void matchTemplatePrepared_CCOFF_8U( - int w, int h, const DevMem2D_ image_sum, - unsigned int templ_sum, DevMem2Df result); + void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_ image_sqsum, unsigned int templ_sqsum, DevMem2Df result, + int cn, cudaStream_t stream); + void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_ image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_8UC2( - int w, int h, - const DevMem2D_ image_sum_r, - const DevMem2D_ image_sum_g, - unsigned int templ_sum_r, unsigned int templ_sum_g, - DevMem2Df result); - + int w, int h, + const DevMem2D_ image_sum_r, + const DevMem2D_ image_sum_g, + unsigned int templ_sum_r, + unsigned int templ_sum_g, + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_8UC3( int w, int h, const DevMem2D_ image_sum_r, @@ -92,8 +81,7 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_r, unsigned int templ_sum_g, unsigned 
int templ_sum_b, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_8UC4( int w, int h, const DevMem2D_ image_sum_r, @@ -104,22 +92,21 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_g, unsigned int templ_sum_b, unsigned int templ_sum_a, - DevMem2Df result); + DevMem2Df result, cudaStream_t stream); + void matchTemplatePrepared_CCOFF_NORMED_8U( int w, int h, const DevMem2D_ image_sum, const DevMem2D_ image_sqsum, unsigned int templ_sum, unsigned int templ_sqsum, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC2( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC3( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -128,8 +115,7 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_r, unsigned int templ_sqsum_r, unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, - DevMem2Df result); - + DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC4( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, @@ -140,12 +126,12 @@ namespace cv { namespace gpu { namespace imgproc unsigned int templ_sum_g, unsigned int templ_sqsum_g, unsigned int templ_sum_b, unsigned int templ_sqsum_b, unsigned int templ_sum_a, unsigned int templ_sqsum_a, - DevMem2Df result); + DevMem2Df result, cudaStream_t stream); void normalize_8U(int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn); + unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream); - void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn); + void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream); }}} @@ -186,103 +172,111 @@ namespace } - void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F)) { - imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels()); + imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); return; } GpuMat result_; - convolve(image.reshape(1), templ.reshape(1), result_, true); - imgproc::extractFirstChannel_32F(result_, result, image.channels()); + ConvolveBuf buf; + convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream); + imgproc::extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U)) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels()); + 
imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); return; } GpuMat imagef, templf; - image.convertTo(imagef, CV_32F); - templ.convertTo(templf, CV_32F); - matchTemplate_CCORR_32F(imagef, templf, result); + if (stream) + { + stream.enqueueConvert(image, imagef, CV_32F); + stream.enqueueConvert(templ, templf, CV_32F); + } + else + { + image.convertTo(imagef, CV_32F); + templ.convertTo(templf, CV_32F); + } + matchTemplate_CCORR_32F(imagef, templf, result, stream); } - void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, - GpuMat& result) + void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum, stream); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, - result, image.channels()); + result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels()); + imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U)) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels()); + imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); return; } GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum, stream); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); imgproc::matchTemplatePrepared_SQDIFF_8U( - templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels()); + templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum, stream); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); imgproc::matchTemplatePrepared_SQDIFF_NORMED_8U( - templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels()); + templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& 
templ, GpuMat& result) + void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { - matchTemplate_CCORR_8U(image, templ, result); + matchTemplate_CCORR_8U(image, templ, result, stream); if (image.channels() == 1) { GpuMat image_sum; - integral(image, image_sum); + integral(image, image_sum, stream); unsigned int templ_sum = (unsigned int)sum(templ)[0]; imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, - image_sum, templ_sum, result); + image_sum, templ_sum, result, StreamAccessor::getStream(stream)); } else { @@ -291,7 +285,7 @@ namespace split(image, images); for (int i = 0; i < image.channels(); ++i) - integral(images[i], image_sums[i]); + integral(images[i], image_sums[i], stream); Scalar templ_sum = sum(templ); @@ -301,19 +295,19 @@ namespace imgproc::matchTemplatePrepared_CCOFF_8UC2( templ.cols, templ.rows, image_sums[0], image_sums[1], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], - result); + result, StreamAccessor::getStream(stream)); break; case 3: imgproc::matchTemplatePrepared_CCOFF_8UC3( templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], - result); + result, StreamAccessor::getStream(stream)); break; case 4: imgproc::matchTemplatePrepared_CCOFF_8UC4( templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], - (unsigned int)templ_sum[3], result); + (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream)); break; default: CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels"); @@ -322,25 +316,34 @@ namespace } - void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) + void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) { GpuMat imagef, templf; - image.convertTo(imagef, CV_32F); - templ.convertTo(templf, CV_32F); - matchTemplate_CCORR_32F(imagef, templf, result); + if (stream) + { + stream.enqueueConvert(image, imagef, CV_32F); + stream.enqueueConvert(templ, templf, CV_32F); + } + else + { + image.convertTo(imagef, CV_32F); + templ.convertTo(templf, CV_32F); + } + + matchTemplate_CCORR_32F(imagef, templf, result, stream); if (image.channels() == 1) { GpuMat image_sum, image_sqsum; - integral(image, image_sum); - sqrIntegral(image, image_sqsum); + integral(image, image_sum, stream); + sqrIntegral(image, image_sqsum, stream); unsigned int templ_sum = (unsigned int)sum(templ)[0]; unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0]; imgproc::matchTemplatePrepared_CCOFF_NORMED_8U( templ.cols, templ.rows, image_sum, image_sqsum, - templ_sum, templ_sqsum, result); + templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream)); } else { @@ -351,8 +354,8 @@ namespace split(image, images); for (int i = 0; i < image.channels(); ++i) { - integral(images[i], image_sums[i]); - sqrIntegral(images[i], image_sqsums[i]); + integral(images[i], image_sums[i], stream); + sqrIntegral(images[i], image_sqsums[i], stream); } Scalar templ_sum = sum(templ); @@ -367,7 +370,7 @@ namespace image_sums[1], image_sqsums[1], (unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], - result); + result, StreamAccessor::getStream(stream)); break; case 3: imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC3( @@ -378,7 +381,7 @@ namespace (unsigned 
int)templ_sum[0], (unsigned int)templ_sqsum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2], - result); + result, StreamAccessor::getStream(stream)); break; case 4: imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC4( @@ -391,7 +394,7 @@ namespace (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2], (unsigned int)templ_sum[3], (unsigned int)templ_sqsum[3], - result); + result, StreamAccessor::getStream(stream)); break; default: CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels"); @@ -401,12 +404,12 @@ namespace } -void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method) +void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream) { CV_Assert(image.type() == templ.type()); CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows); - typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&); + typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, Stream& stream); static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U, ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U, @@ -424,7 +427,7 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& re Caller caller = callers[method]; CV_Assert(caller); - caller(image, templ, result); + caller(image, templ, result, stream); } #endif diff --git a/modules/gpu/src/optical_flow.cpp b/modules/gpu/src/optical_flow.cpp index 7891ef5548..19754c0578 100644 --- a/modules/gpu/src/optical_flow.cpp +++ b/modules/gpu/src/optical_flow.cpp @@ -59,10 +59,8 @@ namespace NCVMatrix& u, NCVMatrix& v, const cudaDeviceProp& devProp) { NCVMemStackAllocator gpuCounter(static_cast(devProp.textureAlignment)); - CV_Assert(gpuCounter.isInitialized()); - NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0); - CV_Assert(ncvStat == NCV_SUCCESS); + ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) ); return gpuCounter.maxSize(); } @@ -130,10 +128,8 @@ void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& f ensureSizeIsEnough(1, bufSize, CV_8UC1, buf); NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast(devProp.textureAlignment), buf.ptr()); - CV_Assert(gpuAllocator.isInitialized()); - NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream); - CV_Assert(ncvStat == NCV_SUCCESS); + ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) ); } void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv, @@ -189,7 +185,7 @@ void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, cons state.ppBuffers[4] = bui.ptr(); state.ppBuffers[5] = bvi.ptr(); - nppSafeCall( nppiStInterpolateFrames(&state) ); + ncvSafeCall( nppiStInterpolateFrames(&state) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index ea5259b075..3c84c93ba2 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -39,15 +39,16 @@ // the use of this software, even if advised of the possibility of such damage. 
// //M*/ + #ifndef __OPENCV_PRECOMP_H__ #define __OPENCV_PRECOMP_H__ #if _MSC_VER >= 1200 -#pragma warning( disable: 4251 4710 4711 4514 4996 ) + #pragma warning( disable: 4251 4710 4711 4514 4996 ) #endif #ifdef HAVE_CVCONFIG_H -#include "cvconfig.h" + #include "cvconfig.h" #endif #include @@ -65,33 +66,43 @@ #include "opencv2/calib3d/calib3d.hpp" #include "opencv2/core/internal.hpp" -#if defined(HAVE_CUDA) +#define OPENCV_GPU_UNUSED(x) (void)x + +#ifdef HAVE_CUDA - #include "internal_shared.hpp" #include "cuda_runtime_api.h" - #include "cufft.h" + #include "npp.h" + + #ifdef HAVE_CUFFT + #include "cufft.h" + #endif + + #ifdef HAVE_CUBLAS + #include "cublas.h" + #endif + + #include "internal_shared.hpp" #include "opencv2/gpu/stream_accessor.hpp" - #include "npp.h" #include "nvidia/core/NCV.hpp" #include "nvidia/NPP_staging/NPP_staging.hpp" #include "nvidia/NCVHaarObjectDetection.hpp" #include "nvidia/NCVBroxOpticalFlow.hpp" -#define CUDART_MINIMUM_REQUIRED_VERSION 4000 -#define NPP_MINIMUM_REQUIRED_VERSION 4000 + #define CUDART_MINIMUM_REQUIRED_VERSION 4000 + #define NPP_MINIMUM_REQUIRED_VERSION 4000 -#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) - #error "Insufficient Cuda Runtime library version, please update it." -#endif + #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) + #error "Insufficient Cuda Runtime library version, please update it." + #endif -#if (NPP_VERSION_MAJOR*1000+NPP_VERSION_MINOR*100+NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION) - #error "Insufficient NPP version, please update it." -#endif + #if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION) + #error "Insufficient NPP version, please update it." + #endif -#if defined(CUDA_ARCH_BIN_OR_PTX_10) - #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0" -#endif + #if defined(CUDA_ARCH_BIN_OR_PTX_10) + #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0" + #endif static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); }
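The NPP, NCV and CUFFT tables above all feed one generic lookup helper instead of per-API string functions. A minimal standalone sketch of that pattern follows; ErrorEntry, error_entry and getErrorString mirror the names used in the patch, while the demo enum, comparer functor and main() are illustrative assumptions only:

    #include <algorithm>
    #include <sstream>
    #include <string>
    #include <iostream>

    struct ErrorEntry { int code; const char* str; };
    #define error_entry(entry) { entry, #entry }   // stringize the enum name for the message

    // Illustrative codes only; the real tables use the NPP/NCV/CUFFT status enums.
    enum { DEMO_BAD_ARG = -1, DEMO_ALLOC_FAILED = -2 };
    const ErrorEntry demo_errors[] = { error_entry(DEMO_BAD_ARG), error_entry(DEMO_ALLOC_FAILED) };
    const size_t demo_error_num = sizeof(demo_errors) / sizeof(demo_errors[0]);

    struct ErrorEntryComparer
    {
        int code;
        ErrorEntryComparer(int code_) : code(code_) {}
        bool operator()(const ErrorEntry& e) const { return e.code == code; }
    };

    std::string getErrorString(int code, const ErrorEntry* errors, size_t n)
    {
        // Linear search over the table; unknown codes fall back to a generic message.
        size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
        const std::string msg = (idx != n) ? errors[idx].str : std::string("Unknown error code");
        std::ostringstream ostr;
        ostr << msg << " [Code = " << code << "]";
        return ostr.str();
    }

    int main() { std::cout << getErrorString(-2, demo_errors, demo_error_num) << std::endl; }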
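The asynchronous overloads added here (meanShiftFiltering, mulSpectrums, dft, convolve, matchTemplate, ...) take an optional cv::gpu::Stream and default to Stream::Null(), so existing synchronous callers keep working. A usage sketch against the post-patch matchTemplate signature; it assumes an 8-bit input image, uses synchronous GpuMat upload constructors for brevity, and a page-locked CudaMem destination would be needed for a truly asynchronous download:

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void match_async(const cv::Mat& image, const cv::Mat& templ, cv::Mat& result)
    {
        cv::gpu::GpuMat d_image(image), d_templ(templ), d_result;   // synchronous uploads
        cv::gpu::Stream stream;

        // Enqueue the matcher and the download on one stream; the host only blocks at the end.
        cv::gpu::matchTemplate(d_image, d_templ, d_result, CV_TM_CCORR, stream);
        stream.enqueueDownload(d_result, result);
        stream.waitForCompletion();
    }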
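The dft() and convolve() changes rely on binding a cuFFT plan to the caller's stream before execution, via cufftSetStream on the handle obtained from StreamAccessor::getStream. A generic standalone illustration of that pattern, not the module's code; buffer sizing is the caller's responsibility (for R2C the output needs rows * (cols/2 + 1) complex values) and return codes are ignored here where the module would wrap them in cufftSafeCall:

    #include <cufft.h>
    #include <cuda_runtime.h>

    void fft_on_stream(float* d_src, cufftComplex* d_dst, int rows, int cols, cudaStream_t stream)
    {
        cufftHandle plan;
        cufftPlan2d(&plan, rows, cols, CUFFT_R2C);   // plan creation itself is not stream-ordered
        cufftSetStream(plan, stream);                // subsequent exec calls are issued on 'stream'
        cufftExecR2C(plan, d_src, d_dst);            // asynchronous with respect to the host
        cudaStreamSynchronize(stream);               // ensure the transform finished before tearing down
        cufftDestroy(plan);
    }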