diff --git a/modules/core/include/opencv2/core/gpu.hpp b/modules/core/include/opencv2/core/gpu.hpp
index c22803bad0..46d07100a8 100644
--- a/modules/core/include/opencv2/core/gpu.hpp
+++ b/modules/core/include/opencv2/core/gpu.hpp
@@ -51,8 +51,7 @@
 #include "opencv2/core.hpp"
 #include "opencv2/core/gpu_types.hpp"
 
-namespace cv { namespace gpu
-{
+namespace cv { namespace gpu {
 
 //////////////////////////////// GpuMat ///////////////////////////////
 
@@ -337,59 +336,56 @@ CV_EXPORTS void registerPageLocked(Mat& m);
 //! unmaps the memory of matrix m, and makes it pageable again
 CV_EXPORTS void unregisterPageLocked(Mat& m);
 
-//////////////////////////////// CudaStream ////////////////////////////////
+///////////////////////////////// Stream //////////////////////////////////
+
 // Encapsulates CUDA stream. Provides interface for async copying.
 // Passed to each function that supports async kernel execution.
-// Reference counting is enabled
+// Reference counting is enabled.
 class CV_EXPORTS Stream
 {
+    typedef void (Stream::*bool_type)() const;
+    void this_type_does_not_support_comparisons() const {}
+
 public:
+    typedef void (*StreamCallback)(int status, void* userData);
+
+    //! creates a new asynchronous stream
     Stream();
-    ~Stream();
 
-    Stream(const Stream&);
-    Stream& operator =(const Stream&);
+    //! queries an asynchronous stream for completion status
+    bool queryIfComplete() const;
 
-    bool queryIfComplete();
+    //! waits for stream tasks to complete
     void waitForCompletion();
 
-    //! downloads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
-    void enqueueDownload(const GpuMat& src, CudaMem& dst);
-    void enqueueDownload(const GpuMat& src, Mat& dst);
+    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
+    void enqueueHostCallback(StreamCallback callback, void* userData);
 
-    //! uploads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
-    void enqueueUpload(const CudaMem& src, GpuMat& dst);
-    void enqueueUpload(const Mat& src, GpuMat& dst);
+    //! returns Stream object for default CUDA stream
+    static Stream& Null();
 
-    //! copy asynchronously
-    void enqueueCopy(const GpuMat& src, GpuMat& dst);
+    //! returns true if stream object is not default (!= 0)
+    operator bool_type() const;
 
-    //! memory set asynchronously
-    void enqueueMemSet(GpuMat& src, Scalar val);
-    void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
+    // obsolete methods
 
-    //! converts matrix type, ex from float to uchar depending on type
-    void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
+    void enqueueDownload(const GpuMat& src, OutputArray dst);
 
-    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
-    typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
-    void enqueueHostCallback(StreamCallback callback, void* userData);
+    void enqueueUpload(InputArray src, GpuMat& dst);
 
-    static Stream& Null();
+    void enqueueCopy(const GpuMat& src, OutputArray dst);
 
-    operator bool() const;
+    void enqueueMemSet(GpuMat& src, Scalar val);
+    void enqueueMemSet(GpuMat& src, Scalar val, InputArray mask);
 
-private:
-    struct Impl;
+    void enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha = 1.0, double beta = 0.0);
 
-    explicit Stream(Impl* impl);
-    void create();
-    void release();
+    class Impl;
 
-    Impl *impl;
+private:
+    Ptr<Impl> impl_;
+    Stream(const Ptr<Impl>& impl);
 
     friend struct StreamAccessor;
 };
 
@@ -498,7 +494,13 @@ CV_EXPORTS void printCudaDeviceInfo(int device);
 CV_EXPORTS void printShortCudaDeviceInfo(int device);
 
-}} // cv::gpu
+}} // namespace cv { namespace gpu {
+
+namespace cv {
+
+template <> CV_EXPORTS void Ptr<cv::gpu::Stream::Impl>::delete_obj();
+
+}
 
 #include "opencv2/core/gpu.inl.hpp"
 
diff --git a/modules/core/include/opencv2/core/gpu.inl.hpp b/modules/core/include/opencv2/core/gpu.inl.hpp
index 10b8ff5947..1983cbcb25 100644
--- a/modules/core/include/opencv2/core/gpu.inl.hpp
+++ b/modules/core/include/opencv2/core/gpu.inl.hpp
@@ -46,8 +46,7 @@
 #include "opencv2/core/gpu.hpp"
 
-namespace cv { namespace gpu
-{
+namespace cv { namespace gpu {
 
 //////////////////////////////// GpuMat ///////////////////////////////
 
@@ -524,7 +523,51 @@ void swap(CudaMem& a, CudaMem& b)
     a.swap(b);
 }
 
-}} // namespace cv { namespace gpu
+//////////////////////////////// Stream ///////////////////////////////
+
+inline
+void Stream::enqueueDownload(const GpuMat& src, OutputArray dst)
+{
+    src.download(dst, *this);
+}
+
+inline
+void Stream::enqueueUpload(InputArray src, GpuMat& dst)
+{
+    dst.upload(src, *this);
+}
+
+inline
+void Stream::enqueueCopy(const GpuMat& src, OutputArray dst)
+{
+    src.copyTo(dst, *this);
+}
+
+inline
+void Stream::enqueueMemSet(GpuMat& src, Scalar val)
+{
+    src.setTo(val, *this);
+}
+
+inline
+void Stream::enqueueMemSet(GpuMat& src, Scalar val, InputArray mask)
+{
+    src.setTo(val, mask, *this);
+}
+
+inline
+void Stream::enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha, double beta)
+{
+    src.convertTo(dst, dtype, alpha, beta, *this);
+}
+
+inline
+Stream::Stream(const Ptr<Impl>& impl)
+    : impl_(impl)
+{
+}
+
+}} // namespace cv { namespace gpu {
 
 //////////////////////////////// Mat ////////////////////////////////
 
diff --git a/modules/core/src/gpu_stream.cpp b/modules/core/src/gpu_stream.cpp
index 251e3a2aab..cf90501593 100644
--- a/modules/core/src/gpu_stream.cpp
+++ b/modules/core/src/gpu_stream.cpp
@@ -45,170 +45,103 @@
 using namespace cv;
 using namespace cv::gpu;
 
-#if !defined (HAVE_CUDA)
+#ifndef HAVE_CUDA
 
-cv::gpu::Stream::Stream() { throw_no_cuda(); }
-cv::gpu::Stream::~Stream() {}
-cv::gpu::Stream::Stream(const Stream&) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::operator=(const Stream&) { throw_no_cuda(); return *this; }
-bool cv::gpu::Stream::queryIfComplete() { throw_no_cuda(); return false; }
-void cv::gpu::Stream::waitForCompletion() { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::Null() { throw_no_cuda(); static Stream s; return s; }
-cv::gpu::Stream::operator bool() const { throw_no_cuda(); return false; }
-cv::gpu::Stream::Stream(Impl*) { throw_no_cuda(); }
-void cv::gpu::Stream::create() { throw_no_cuda(); }
-void cv::gpu::Stream::release() { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-struct Stream::Impl
+class cv::gpu::Stream::Impl
 {
-    static cudaStream_t getStream(const Impl* impl)
+public:
+    Impl(void* ptr = 0)
     {
-        return impl ? impl->stream : 0;
+        (void) ptr;
+        throw_no_cuda();
     }
+};
+
+#else
 
+class cv::gpu::Stream::Impl
+{
+public:
     cudaStream_t stream;
-    int ref_counter;
+
+    Impl();
+    Impl(cudaStream_t stream);
+
+    ~Impl();
 };
 
-cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
+cv::gpu::Stream::Impl::Impl() : stream(0)
 {
-    return Stream::Impl::getStream(stream.impl);
+    cudaSafeCall( cudaStreamCreate(&stream) );
 }
 
-cv::gpu::Stream::Stream() : impl(0)
+cv::gpu::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
 {
-    create();
 }
 
-cv::gpu::Stream::~Stream()
+cv::gpu::Stream::Impl::~Impl()
 {
-    release();
+    if (stream)
+        cudaStreamDestroy(stream);
 }
 
-cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl)
+cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
 {
-    if (impl)
-        CV_XADD(&impl->ref_counter, 1);
+    return stream.impl_->stream;
 }
 
-Stream& cv::gpu::Stream::operator =(const Stream& stream)
-{
-    if (this != &stream)
-    {
-        release();
-        impl = stream.impl;
-        if (impl)
-            CV_XADD(&impl->ref_counter, 1);
-    }
+#endif
 
-    return *this;
+cv::gpu::Stream::Stream()
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+#else
+    impl_ = new Impl;
+#endif
 }
 
-bool cv::gpu::Stream::queryIfComplete()
+bool cv::gpu::Stream::queryIfComplete() const
 {
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaError_t err = cudaStreamQuery(stream);
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+    return false;
+#else
+    cudaError_t err = cudaStreamQuery(impl_->stream);
 
     if (err == cudaErrorNotReady || err == cudaSuccess)
         return err == cudaSuccess;
 
     cudaSafeCall(err);
     return false;
+#endif
 }
 
 void cv::gpu::Stream::waitForCompletion()
 {
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaSafeCall( cudaStreamSynchronize(stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
-{
-    // if not -> allocation will be done, but after that dst will not point to page locked memory
-    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
-{
-    src.setTo(val, *this);
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
-{
-    src.setTo(val, mask, *this);
-}
-
-void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double alpha, double beta)
-{
-    src.convertTo(dst, dtype, alpha, beta, *this);
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+#else
+    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
+#endif
 }
 
-#if CUDART_VERSION >= 5000
+#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)
 
 namespace
 {
     struct CallbackData
     {
-        cv::gpu::Stream::StreamCallback callback;
+        Stream::StreamCallback callback;
         void* userData;
-        Stream stream;
+
+        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
     };
 
     void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
     {
         CallbackData* data = reinterpret_cast<CallbackData*>(userData);
-        data->callback(data->stream, static_cast<int>(status), data->userData);
+        data->callback(static_cast<int>(status), data->userData);
         delete data;
     }
 }
 
@@ -217,58 +150,39 @@ namespace
 
 void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
 {
-#if CUDART_VERSION >= 5000
-    CallbackData* data = new CallbackData;
-    data->callback = callback;
-    data->userData = userData;
-    data->stream = *this;
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    cudaSafeCall( cudaStreamAddCallback(stream, cudaStreamCallback, data, 0) );
-#else
+#ifndef HAVE_CUDA
     (void) callback;
     (void) userData;
-    CV_Error(CV_StsNotImplemented, "This function requires CUDA 5.0");
+    throw_no_cuda();
+#else
+    #if CUDART_VERSION < 5000
+        (void) callback;
+        (void) userData;
+        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0");
+    #else
+        CallbackData* data = new CallbackData(callback, userData);
+
+        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
+    #endif
 #endif
 }
 
-cv::gpu::Stream& cv::gpu::Stream::Null()
+Stream& cv::gpu::Stream::Null()
 {
-    static Stream s((Impl*) 0);
+    static Stream s(new Impl(0));
     return s;
 }
 
-cv::gpu::Stream::operator bool() const
-{
-    return impl && impl->stream;
-}
-
-cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_)
+cv::gpu::Stream::operator bool_type() const
 {
+#ifndef HAVE_CUDA
+    return 0;
+#else
+    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
+#endif
 }
 
-void cv::gpu::Stream::create()
-{
-    if (impl)
-        release();
-
-    cudaStream_t stream;
-    cudaSafeCall( cudaStreamCreate( &stream ) );
-
-    impl = (Stream::Impl*) fastMalloc(sizeof(Stream::Impl));
-
-    impl->stream = stream;
-    impl->ref_counter = 1;
-}
-
-void cv::gpu::Stream::release()
+template <> void cv::Ptr<Stream::Impl>::delete_obj()
 {
-    if (impl && CV_XADD(&impl->ref_counter, -1) == 1)
-    {
-        cudaSafeCall( cudaStreamDestroy(impl->stream) );
-        cv::fastFree(impl);
-    }
+    if (obj) delete obj;
 }
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuarithm/src/arithm.cpp b/modules/gpuarithm/src/arithm.cpp
index c605b989ea..a6cd1cb62e 100644
--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
@@ -217,10 +217,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     {
         if (src3.empty())
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
         else
         {
@@ -230,10 +227,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     }
     else
     {
-        if (stream)
-            stream.enqueueCopy(src3, dst);
-        else
-            src3.copyTo(dst);
+        src3.copyTo(dst, stream);
     }
 }
 
@@ -336,18 +330,13 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
         cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
 
         sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        if (s)
-            s.enqueueMemSet(sum, Scalar::all(0));
-        else
-            sum.setTo(Scalar::all(0));
+
+        sum.setTo(Scalar::all(0), s);
 
         GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
         GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
-        if (s)
-            s.enqueueCopy(res, inner);
-        else
-            res.copyTo(inner);
+
+        res.copyTo(inner, s);
     }
     else
     {
@@ -720,10 +709,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 
                 GpuMat result_block(result_roi_size, result_data.type(), result_data.ptr(), result_data.step);
 
-                if (stream)
-                    stream.enqueueCopy(result_block, result_roi);
-                else
-                    result_block.copyTo(result_roi);
+                result_block.copyTo(result_roi, stream);
             }
         }
 
diff --git a/modules/gpubgsegm/src/gmg.cpp b/modules/gpubgsegm/src/gmg.cpp
index f29bf45519..a38cbffaca 100644
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@@ -134,10 +134,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
         initialize(frame.size(), 0.0f, frame.depth() == CV_8U ? 255.0f : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0f);
 
     fgmask.create(frameSize_, CV_8UC1);
-    if (stream)
-        stream.enqueueMemSet(fgmask, cv::Scalar::all(0));
-    else
-        fgmask.setTo(cv::Scalar::all(0));
+    fgmask.setTo(cv::Scalar::all(0), stream);
 
     funcs[frame.depth()][frame.channels() - 1](frame, fgmask, colors_, weights_, nfeatures_, frameNum_, learningRate, updateBackgroundModel, cv::gpu::StreamAccessor::getStream(stream));
 
diff --git a/modules/gpufeatures2d/src/brute_force_matcher.cpp b/modules/gpufeatures2d/src/brute_force_matcher.cpp
index e350d48cfa..feb0cc6928 100644
--- a/modules/gpufeatures2d/src/brute_force_matcher.cpp
+++ b/modules/gpufeatures2d/src/brute_force_matcher.cpp
@@ -497,10 +497,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
         ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
     }
 
-    if (stream)
-        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
-    else
-        trainIdx.setTo(Scalar::all(-1));
+    trainIdx.setTo(Scalar::all(-1), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
@@ -616,10 +613,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
     ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
     ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
 
-    if (stream)
-        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
-    else
-        trainIdx.setTo(Scalar::all(-1));
+    trainIdx.setTo(Scalar::all(-1), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
@@ -803,10 +797,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
         ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
     }
 
-    if (stream)
-        stream.enqueueMemSet(nMatches, Scalar::all(0));
-    else
-        nMatches.setTo(Scalar::all(0));
+    nMatches.setTo(Scalar::all(0), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
@@ -931,10 +922,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
         ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
     }
 
-    if (stream)
-        stream.enqueueMemSet(nMatches, Scalar::all(0));
-    else
-        nMatches.setTo(Scalar::all(0));
+    nMatches.setTo(Scalar::all(0), stream);
 
     caller_t func = callers[query.depth()];
     CV_Assert(func != 0);
 
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 8232ab804c..26442f546f 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -157,10 +157,7 @@ namespace
 
         if (roi.size() != src_size)
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
 
         normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@@ -221,10 +218,7 @@ namespace
 
         if (roi.size() != src_size)
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
 
         ensureSizeIsEnough(src_size, bufType, *pbuf);
@@ -487,10 +481,7 @@ namespace
 
         if (roi.size() != src_size)
         {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
         }
 
         normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@@ -557,10 +548,7 @@ namespace
 
         if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
         {
-            if (stream)
-                stream.enqueueCopy(src, dst);
-            else
-                src.copyTo(dst);
+            src.copyTo(dst, stream);
             return;
         }
 
diff --git a/modules/gpuimgproc/src/match_template.cpp b/modules/gpuimgproc/src/match_template.cpp
index 17d7b7686e..008d3da1ce 100644
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -196,16 +196,9 @@ namespace
             return;
         }
 
-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
+        image.convertTo(buf.imagef, CV_32F, stream);
+        templ.convertTo(buf.templf, CV_32F, stream);
+
         matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
     }
 
@@ -317,16 +310,8 @@ namespace
     void matchTemplate_CCOFF_NORMED_8U(
             const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
     {
-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
+        image.convertTo(buf.imagef, CV_32F, stream);
+        templ.convertTo(buf.templf, CV_32F, stream);
 
         matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
 
diff --git a/modules/gpuoptflow/src/farneback.cpp b/modules/gpuoptflow/src/farneback.cpp
index 60a9cda67c..9ed6403eea 100644
--- a/modules/gpuoptflow/src/farneback.cpp
+++ b/modules/gpuoptflow/src/farneback.cpp
@@ -235,8 +235,8 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
         break;
     }
 
-    streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
-    streams[1].enqueueConvert(frame1, frames_[1], CV_32F);
+    frame0.convertTo(frames_[0], CV_32F, streams[0]);
+    frame1.convertTo(frames_[1], CV_32F, streams[1]);
 
     if (fastPyramids)
     {
@@ -293,21 +293,21 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
             {
                 gpu::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
                 gpu::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-                streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
-                streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
+                curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
+                curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
             }
             else
             {
-                streams[0].enqueueMemSet(curFlowX, 0);
-                streams[1].enqueueMemSet(curFlowY, 0);
+                curFlowX.setTo(0, streams[0]);
+                curFlowY.setTo(0, streams[1]);
             }
         }
         else
         {
            gpu::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
            gpu::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-            streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
-            streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
+            curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
+            curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
         }
 
         GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
@@ -343,7 +343,7 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
         {
             cudev::optflow_farneback::gaussianBlurGpu(
                 frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
-            gpu::resize(blurredFrame[i], pyrLevel[i], Size(width, height), INTER_LINEAR, streams[i]);
+            gpu::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
             cudev::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
         }
     }
 
diff --git a/modules/gpustereo/src/disparity_bilateral_filter.cpp b/modules/gpustereo/src/disparity_bilateral_filter.cpp
index ef5be018da..d13fcc004f 100644
--- a/modules/gpustereo/src/disparity_bilateral_filter.cpp
+++ b/modules/gpustereo/src/disparity_bilateral_filter.cpp
@@ -113,10 +113,7 @@ namespace
 
         if (&dst != &disp)
         {
-            if (stream)
-                stream.enqueueCopy(disp, dst);
-            else
-                disp.copyTo(dst);
+            disp.copyTo(dst, stream);
         }
 
         disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
 
diff --git a/modules/gpustereo/src/stereobp.cpp b/modules/gpustereo/src/stereobp.cpp
index 957eb70144..5ce56c1d70 100644
--- a/modules/gpustereo/src/stereobp.cpp
+++ b/modules/gpustereo/src/stereobp.cpp
@@ -194,20 +194,10 @@ namespace
 
         if (rthis.levels & 1)
         {
             //can clear less area
-            if (stream)
-            {
-                stream.enqueueMemSet(u, zero);
-                stream.enqueueMemSet(d, zero);
-                stream.enqueueMemSet(l, zero);
-                stream.enqueueMemSet(r, zero);
-            }
-            else
-            {
-                u.setTo(zero);
-                d.setTo(zero);
-                l.setTo(zero);
-                r.setTo(zero);
-            }
+            u.setTo(zero, stream);
+            d.setTo(zero, stream);
+            l.setTo(zero, stream);
+            r.setTo(zero, stream);
         }
 
         if (rthis.levels > 1)
@@ -222,20 +212,10 @@ namespace
 
             if ((rthis.levels & 1) == 0)
             {
-                if (stream)
-                {
-                    stream.enqueueMemSet(u2, zero);
-                    stream.enqueueMemSet(d2, zero);
-                    stream.enqueueMemSet(l2, zero);
-                    stream.enqueueMemSet(r2, zero);
-                }
-                else
-                {
-                    u2.setTo(zero);
-                    d2.setTo(zero);
-                    l2.setTo(zero);
-                    r2.setTo(zero);
-                }
+                u2.setTo(zero, stream);
+                d2.setTo(zero, stream);
+                l2.setTo(zero, stream);
+                r2.setTo(zero, stream);
             }
         }
 
@@ -313,20 +293,12 @@ namespace
 
         out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
 
-        if (stream)
-            stream.enqueueMemSet(out, zero);
-        else
-            out.setTo(zero);
+        out.setTo(zero, stream);
 
         output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream);
 
         if (disp.type() != CV_16S)
-        {
-            if (stream)
-                stream.enqueueConvert(out, disp, disp.type());
-            else
-                out.convertTo(disp, disp.type());
-        }
+            out.convertTo(disp, disp.type(), stream);
     }
 
     StereoBeliefPropagation& rthis;
 
diff --git a/modules/gpustereo/src/stereocsbp.cpp b/modules/gpustereo/src/stereocsbp.cpp
index bd5ef4be9f..cedba1eeb9 100644
--- a/modules/gpustereo/src/stereocsbp.cpp
+++ b/modules/gpustereo/src/stereocsbp.cpp
@@ -213,36 +213,18 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& te
     load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
 
-    if (stream)
-    {
-        stream.enqueueMemSet(l[0], zero);
-        stream.enqueueMemSet(d[0], zero);
-        stream.enqueueMemSet(r[0], zero);
-        stream.enqueueMemSet(u[0], zero);
-
-        stream.enqueueMemSet(l[1], zero);
-        stream.enqueueMemSet(d[1], zero);
-        stream.enqueueMemSet(r[1], zero);
-        stream.enqueueMemSet(u[1], zero);
-
-        stream.enqueueMemSet(data_cost, zero);
-        stream.enqueueMemSet(data_cost_selected, zero);
-    }
-    else
-    {
-        l[0].setTo(zero);
-        d[0].setTo(zero);
-        r[0].setTo(zero);
-        u[0].setTo(zero);
-
-        l[1].setTo(zero);
-        d[1].setTo(zero);
-        r[1].setTo(zero);
-        u[1].setTo(zero);
-
-        data_cost.setTo(zero);
-        data_cost_selected.setTo(zero);
-    }
+    l[0].setTo(zero, stream);
+    d[0].setTo(zero, stream);
+    r[0].setTo(zero, stream);
+    u[0].setTo(zero, stream);
+
+    l[1].setTo(zero, stream);
+    d[1].setTo(zero, stream);
+    r[1].setTo(zero, stream);
+    u[1].setTo(zero, stream);
+
+    data_cost.setTo(zero, stream);
+    data_cost_selected.setTo(zero, stream);
 
     int cur_idx = 0;
 
@@ -279,20 +261,14 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& te
 
     out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
 
-    if (stream)
-        stream.enqueueMemSet(out, zero);
-    else
-        out.setTo(zero);
+    out.setTo(zero, stream);
 
     compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);
 
     if (disp.type() != CV_16S)
     {
-        if (stream)
-            stream.enqueueConvert(out, disp, disp.type());
-        else
-            out.convertTo(disp, disp.type());
+        out.convertTo(disp, disp.type(), stream);
     }
 }
 
diff --git a/modules/gpuwarping/src/pyramids.cpp b/modules/gpuwarping/src/pyramids.cpp
index db9dd611a0..19d5dcf94a 100644
--- a/modules/gpuwarping/src/pyramids.cpp
+++ b/modules/gpuwarping/src/pyramids.cpp
@@ -184,10 +184,7 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream
 
     if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
     {
-        if (stream)
-            stream.enqueueCopy(layer0_, outImg);
-        else
-            layer0_.copyTo(outImg);
+        layer0_.copyTo(outImg, stream);
     }
 
     float lastScale = 1.0f;
@@ -202,10 +199,7 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream
 
         if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
         {
-            if (stream)
-                stream.enqueueCopy(curLayer, outImg);
-            else
-                curLayer.copyTo(outImg);
+            curLayer.copyTo(outImg, stream);
         }
 
         if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
 
diff --git a/modules/gpuwarping/src/resize.cpp b/modules/gpuwarping/src/resize.cpp
index 68708b41a0..5cb5184833 100644
--- a/modules/gpuwarping/src/resize.cpp
+++ b/modules/gpuwarping/src/resize.cpp
@@ -77,10 +77,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
 
     if (dsize == src.size())
    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
+        src.copyTo(dst, s);
         return;
     }
 
diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp
index 3e7795df80..6119620e27 100644
--- a/modules/softcascade/src/detector_cuda.cpp
+++ b/modules/softcascade/src/detector_cuda.cpp
@@ -335,10 +335,7 @@ struct cv::softcascade::SCascade::Fields
 
     void detect(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) const
     {
-        if (s)
-            s.enqueueMemSet(objects, 0);
-        else
-            cudaMemset(objects.data, 0, sizeof(Detection));
+        objects.setTo(Scalar::all(0), s);
 
         cudaSafeCall( cudaGetLastError());
 
@@ -354,16 +351,8 @@ struct cv::softcascade::SCascade::Fields
         cv::gpu::GpuMat ndetections = cv::gpu::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
         ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
 
-        if (s)
-        {
-            s.enqueueMemSet(overlaps, 0);
-            s.enqueueMemSet(suppressed, 0);
-        }
-        else
-        {
-            overlaps.setTo(0);
-            suppressed.setTo(0);
-        }
+        overlaps.setTo(0, s);
+        suppressed.setTo(0, s);
 
         cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
         cudev::suppress(objects, overlaps, ndetections, suppressed, stream);
@@ -488,18 +477,12 @@ void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat&
         cv::softcascade::cudev::shfl_integral(src, buffer, stream);
 
         sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        if (s)
-            s.enqueueMemSet(sum, cv::Scalar::all(0));
-        else
-            sum.setTo(cv::Scalar::all(0));
+        sum.setTo(cv::Scalar::all(0), s);
 
         cv::gpu::GpuMat inner = sum(cv::Rect(1, 1, src.cols, src.rows));
         cv::gpu::GpuMat res = buffer(cv::Rect(0, 0, src.cols, src.rows));
-        if (s)
-            s.enqueueCopy(res, inner);
-        else
-            res.copyTo(inner);
+        res.copyTo(inner, s);
     }
     else
         {CV_Error(cv::Error::GpuNotSupported, ": CC 3.x required.");}
 }
 
@@ -541,10 +524,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
     }
     else
     {
-        if (s)
-            s.enqueueCopy(image, flds.hogluv);
-        else
-            image.copyTo(flds.hogluv);
+        image.copyTo(flds.hogluv, s);
     }
 
     flds.detect(objects, s);
 
@@ -571,10 +551,7 @@ using cv::gpu::GpuMat;
 
 inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s)
 {
-    if (s)
-        s.enqueueMemSet(m, 0);
-    else
-        m.setTo(0);
+    m.setTo(0, s);
 }
 
 struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
 
diff --git a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
index 3a0f99e499..87b5255990 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@@ -368,8 +368,8 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
 
     gpu::Stream stream;
 
-    stream.enqueueConvert(b.gI1, b.t1, CV_32F);
-    stream.enqueueConvert(b.gI2, b.t2, CV_32F);
+    b.gI1.convertTo(b.t1, CV_32F, stream);
+    b.gI2.convertTo(b.t2, CV_32F, stream);
 
     gpu::split(b.t1, b.vI1, stream);
     gpu::split(b.t2, b.vI2, stream);
@@ -379,16 +379,16 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
 
     for( int i = 0; i < b.gI1.channels(); ++i )
     {
-        gpu::multiply(b.vI2[i], b.vI2[i], b.I2_2, stream);        // I2^2
-        gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, stream);        // I1^2
-        gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, stream);       // I1 * I2
+        gpu::multiply(b.vI2[i], b.vI2[i], b.I2_2, 1, -1, stream);        // I2^2
+        gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
+        gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
 
         gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
         gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
 
-        gpu::multiply(b.mu1, b.mu1, b.mu1_2, stream);
-        gpu::multiply(b.mu2, b.mu2, b.mu2_2, stream);
-        gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, stream);
+        gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
+        gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
+        gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
 
         gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
         gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
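
Taken together, the hunks above apply one mechanical migration: the old two-path idiom "if (stream) stream.enqueueX(...); else x.y(...);" becomes a single stream-aware GpuMat call, with Stream::Null() (the default CUDA stream) covering the synchronous case, and host callbacks lose their Stream& parameter. Below is a minimal usage sketch against this revision of the API; the function and variable names are illustrative, not part of the patch.

// Illustrative sketch only -- exercises the refactored cv::gpu::Stream API shown above.
#include "opencv2/core/gpu.hpp"

using namespace cv;

// New callback signature: (status, userData); the old Stream& parameter is gone.
static void onStreamDone(int /*status*/, void* /*userData*/)
{
    // Runs on a host thread after everything queued before the callback completes.
}

void convertAndZero(const gpu::GpuMat& src, gpu::GpuMat& dst, gpu::Stream& stream)
{
    // Old: if (stream) stream.enqueueConvert(src, dst, CV_32F);
    //      else        src.convertTo(dst, CV_32F);
    // New: one call; passing Stream::Null() reproduces the synchronous behavior.
    src.convertTo(dst, CV_32F, stream);

    dst.setTo(Scalar::all(0), stream);            // was stream.enqueueMemSet(dst, 0)

    stream.enqueueHostCallback(onStreamDone, 0);  // requires CUDA 5.0+
    stream.waitForCompletion();
}

The safe-bool conversion (operator bool_type) keeps existing "if (stream)" checks compiling while making accidental comparisons between Stream objects a compile error.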