fixed gpu::pyrUp (its output now matches the CPU implementation)

fixed several warnings
13 years ago · 6397fa5b38
parent 484fe1d598
commit 6397fa5b38
11 changed files with 658 additions and 607 deletions
--- a/modules/gpu/doc/image_processing.rst
+++ b/modules/gpu/doc/image_processing.rst
@ -713,13 +713,11 @@ gpu::pyrDown
 -------------------
 Smoothes an image and downsamples it.

-.. ocv:function:: void gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())

    :param src: Source image.

    :param dst: Destination image. Will have ``Size((src.cols+1)/2, (src.rows+1)/2)`` size and the same type as ``src`` .
-    
-    :param borderType: Pixel extrapolation method (see  :ocv:func:`borderInterpolate` ). ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.

    :param stream: Stream for the asynchronous version.

@ -731,13 +729,11 @@ gpu::pyrUp
 -------------------
 Upsamples an image and then smoothes it.

-.. ocv:function:: void gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())

    :param src: Source image.

    :param dst: Destination image. Will have ``Size(src.cols*2, src.rows*2)`` size and the same type as ``src`` .
-    
-    :param borderType: Pixel extrapolation method (see  :ocv:func:`borderInterpolate` ). ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.

    :param stream: Stream for the asynchronous version.

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -836,10 +836,10 @@ private:
 CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null());

 //! smoothes the source image and downsamples it
-CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());

 //! upsamples the source image and then smoothes it
-CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());

 //! performs linear blending of two images
 //! to avoid accuracy errors sum of weigths shouldn't be very close to zero
@ -1572,7 +1572,7 @@ public:
    int nOctaveLayers;
    bool extended;
    bool upright;
-    
+
    //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
    float keypointsRatio;

--- a/modules/gpu/src/cuda/pyr_down.cu
+++ b/modules/gpu/src/cuda/pyr_down.cu
@ -46,9 +46,9 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
        {
@ -60,11 +60,11 @@ namespace cv { namespace gpu { namespace device
            __shared__ value_type smem[256 + 4];

            value_type sum;
-            
+
            const int src_y = 2*y;

            sum = VecTraits<value_type>::all(0);
-            
+
            sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);
            sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);
@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace device
                const int left_x = x - 2;

                sum = VecTraits<value_type>::all(0);
-            
+
                sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
                sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);
                sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);
@ -93,7 +93,7 @@ namespace cv { namespace gpu { namespace device
                const int right_x = x + 2;

                sum = VecTraits<value_type>::all(0);
-            
+
                sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
                sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);
                sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);
@ -124,7 +124,7 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
+        template <typename T, template <typename> class B> void pyrDown_caller(DevMem2D_<T> src, DevMem2D_<T> dst, cudaStream_t stream)
        {
            const dim3 block(256);
            const dim3 grid(divUp(src.cols, block.x), dst.rows);
@ -138,48 +138,39 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
+        template <typename T> void pyrDown_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
        {
-            typedef typename TypeVec<T, cn>::vec_type type;
-
-            typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
-
-            static const caller_t callers[] = 
-            {
-                pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
-            };
-
-            callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
+            pyrDown_caller<T, BrdReflect101>(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(dst), stream);
        }

-        template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<uchar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<uchar2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        //template void pyrDown_gpu<schar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<ushort>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<ushort2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<short>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<short2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<short3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<short4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        //template void pyrDown_gpu<int>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<float>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrDown_gpu<float2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrDown_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ b/modules/gpu/src/cuda/pyr_up.cu
@ -46,14 +46,13 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
-        template <class SrcPtr, typename D> __global__ void pyrUp(const SrcPtr src, DevMem2D_<D> dst)
+        template <typename T> __global__ void pyrUp(const DevMem2D_<T> src, DevMem2D_<T> dst)
        {
-            typedef typename SrcPtr::elem_type src_t;
-            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -63,8 +62,14 @@ namespace cv { namespace gpu { namespace device

            if (threadIdx.x < 10 && threadIdx.y < 10)
            {
-                const int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
-                const int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
+                int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
+                int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
+
+                srcx = ::abs(srcx);
+                srcx = ::min(src.cols - 1, srcx);
+
+                srcy = ::abs(srcy);
+                srcy = ::min(src.rows - 1, srcy);

                s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
            }
@ -134,66 +139,54 @@ namespace cv { namespace gpu { namespace device
            sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];

            if (x < dst.cols && y < dst.rows)
-                dst(y, x) = saturate_cast<D>(4.0f * sum);
+                dst(y, x) = saturate_cast<T>(4.0f * sum);
        }

-        template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
+        template <typename T> void pyrUp_caller(DevMem2D_<T> src, DevMem2D_<T> dst, cudaStream_t stream)
        {
            const dim3 block(16, 16);
            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

-            B<T> b(src.rows, src.cols);
-            BorderReader< PtrStep<T>, B<T> > srcReader(src, b);
-
-            pyrUp<<<grid, block, 0, stream>>>(srcReader, dst);
+            pyrUp<<<grid, block, 0, stream>>>(src, dst);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
+        template <typename T> void pyrUp_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
        {
-            typedef typename TypeVec<T, cn>::vec_type type;
-
-            typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
-
-            static const caller_t callers[] = 
-            {
-                pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
-            };
-
-            callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
+            pyrUp_caller<T>(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(dst), stream);
        }

-        template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-        template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-        template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<uchar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<uchar2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        //template void pyrUp_gpu<schar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<ushort>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<ushort2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<short>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<short2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<short3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<short4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        //template void pyrUp_gpu<int>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<float>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        //template void pyrUp_gpu<float2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template void pyrUp_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
    } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -87,8 +87,6 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); }
 void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
 void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
 void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_nogpu(); }
-void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
-void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
@ -96,17 +94,15 @@ void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, do
 cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
 void cv::gpu::CannyBuf::release() { throw_nogpu(); }
-void cv::gpu::ImagePyramid::build(const GpuMat&, int, Stream&) { throw_nogpu(); }
-void cv::gpu::ImagePyramid::getLayer(GpuMat&, Size, Stream&) const { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

 ////////////////////////////////////////////////////////////////////////
 // meanShiftFiltering_GPU

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
    }
@ -140,9 +136,9 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
 ////////////////////////////////////////////////////////////////////////
 // meanShiftProc_GPU

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
    }
@ -177,9 +173,9 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
 ////////////////////////////////////////////////////////////////////////
 // drawColorDisp

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
        void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
@ -213,9 +209,9 @@ void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& s
 ////////////////////////////////////////////////////////////////////////
 // reprojectImageTo3D

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
        void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
@ -249,9 +245,9 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,
 ////////////////////////////////////////////////////////////////////////
 // copyMakeBorder

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
    }
@ -329,7 +325,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
    else
    {
        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
-        static const caller_t callers[6][4] = 
+        static const caller_t callers[6][4] =
        {
            {   copyMakeBorder_caller<uchar, 1>  , 0/*copyMakeBorder_caller<uchar, 2>*/ ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
            {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
@ -352,9 +348,9 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
 //////////////////////////////////////////////////////////////////////////////
 // buildWarpPlaneMaps

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
@ -362,7 +358,7 @@ namespace cv { namespace gpu { namespace device
    }
 }}}

-void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, 
+void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T,
                                 float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)
 {
    using namespace ::cv::gpu::device::imgproc;
@ -378,16 +374,16 @@ void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, cons

    map_x.create(dst_roi.size(), CV_32F);
    map_y.create(dst_roi.size(), CV_32F);
-    buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), 
+    buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
                       T.ptr<float>(), scale, StreamAccessor::getStream(stream));
 }

 //////////////////////////////////////////////////////////////////////////////
 // buildWarpCylyndricalMaps

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                                      const float k_rinv[9], const float r_kinv[9], float scale,
@ -417,9 +413,9 @@ void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K
 //////////////////////////////////////////////////////////////////////////////
 // buildWarpSphericalMaps

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                                    const float k_rinv[9], const float r_kinv[9], float scale,
@ -449,7 +445,7 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K,
 // rotate

 namespace
-{    
+{
    template<int DEPTH> struct NppTypeTraits;
    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
@ -463,7 +459,7 @@ namespace
    {
        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;

-        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, 
+        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
                                    npp_t* pDst, int nDstStep, NppiRect oDstROI,
                                    double nAngle, double nShiftX, double nShiftY, int eInterpolation);
    };
@ -503,7 +499,7 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
 {
    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);

-    static const func_t funcs[6][4] = 
+    static const func_t funcs[6][4] =
    {
        {NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
        {0,0,0,0},
@ -536,13 +532,13 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
    CV_Assert(src.type() == CV_8UC1);

    sum.create(src.rows + 1, src.cols + 1, CV_32S);
-    
+
    NcvSize32u roiSize;
    roiSize.width = src.cols;
    roiSize.height = src.rows;

-	cudaDeviceProp prop;
-	cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );

    Ncv32u bufSize;
    ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
@ -552,7 +548,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S

    NppStStreamHandler h(stream);

-    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step), 
+    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
        sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );

    if (stream == 0)
@ -570,11 +566,11 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
    roiSize.width = src.cols;
    roiSize.height = src.rows;

-	cudaDeviceProp prop;
-	cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );

    Ncv32u bufSize;
-    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));	
+    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
    GpuMat buf(1, bufSize, CV_8U);

    cudaStream_t stream = StreamAccessor::getStream(s);
@ -582,7 +578,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
    NppStStreamHandler h(stream);

    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
-    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step), 
+    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
            sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));

    if (stream == 0)
@ -592,7 +588,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
 //////////////////////////////////////////////////////////////////////////////
 // columnSum

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
@ -651,8 +647,8 @@ namespace
    {
        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;

-	typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
-		    int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
+    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
+            int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
    };
    template<int SDEPTH> struct NppHistogramEvenFuncC4
    {
@ -779,7 +775,7 @@ namespace

            int buf_size;
            get_buf_size(sz, levels.cols, &buf_size);
-            
+
            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);

            NppStreamHandler h(stream);
@ -931,7 +927,7 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
 }

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace hist
    {
@ -1002,7 +998,7 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
    NppStreamHandler h(stream);

    nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );
-    
+
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );

@ -1012,22 +1008,22 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
 ////////////////////////////////////////////////////////////////////////
 // cornerHarris & minEgenVal

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void cornerHarris_gpu(int block_size, float k, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream);
        void cornerMinEigenVal_gpu(int block_size, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream);
    }
 }}}

-namespace 
+namespace
 {
    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
    {
        double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;

-        if (ksize < 0) 
+        if (ksize < 0)
            scale *= 2.;

        if (src.depth() == CV_8U)
@ -1105,7 +1101,7 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D
 }

 void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
-{  
+{
    GpuMat Dx, Dy;
    cornerMinEigenVal(src, dst, Dx, Dy, blockSize, ksize, borderType);
 }
@ -1117,7 +1113,7 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
 }

 void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
-{  
+{
    using namespace ::cv::gpu::device::imgproc;

    CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
@ -1125,7 +1121,7 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

-    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);   
+    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);

    dst.create(src.size(), CV_32F);

@ -1135,9 +1131,9 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
 //////////////////////////////////////////////////////////////////////////////
 // mulSpectrums

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);

@ -1145,7 +1141,7 @@ namespace cv { namespace gpu { namespace device
    }
 }}}

-void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) 
+void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
 {
    using namespace ::cv::gpu::device::imgproc;

@ -1165,9 +1161,9 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag
 //////////////////////////////////////////////////////////////////////////////
 // mulAndScaleSpectrums

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);

@ -1175,7 +1171,7 @@ namespace cv { namespace gpu { namespace device
    }
 }}}

-void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) 
+void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
 {
    using namespace ::cv::gpu::device::imgproc;

@ -1225,7 +1221,7 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stre

    GpuMat src_data;

-    // Make sure here we work with the continuous input, 
+    // Make sure here we work with the continuous input,
    // as CUFFT can't handle gaps
    src_data = src;
    createContinuous(src.rows, src.cols, src.type(), src_data);
@ -1241,7 +1237,7 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stre
    }

    cufftType dft_type = CUFFT_R2C;
-    if (is_complex_input) 
+    if (is_complex_input)
        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;

    CV_Assert(dft_size_opt.width > 1);
@ -1304,7 +1300,7 @@ void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
 void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size, Size block_size)
 {
    result_size = Size(image_size.width - templ_size.width + 1,
-                       image_size.height - templ_size.height + 1);   
+                       image_size.height - templ_size.height + 1);

    this->block_size = block_size;

@ -1377,10 +1373,10 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );

    GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
-    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, 
+    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
                   templ_block.cols - templ_roi.cols, 0, Scalar(), stream);

-    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), 
+    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
                               templ_spect.ptr<cufftComplex>()));

    // Process all blocks of the result matrix
@ -1390,23 +1386,23 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
        {
            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
                                std::min(y + dft_size.height, image.rows) - y);
-            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x), 
+            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
                             image.step);
            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
                           0, image_block.cols - image_roi.cols, 0, Scalar(), stream);

-            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(), 
+            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
                                       image_spect.ptr<cufftComplex>()));
            mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
                                 1.f / dft_size.area(), ccorr, stream);
-            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(), 
+            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
                                       result_data.ptr<cufftReal>()));

            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
                                 std::min(y + block_size.height, result.rows) - y);
-            GpuMat result_roi(result_roi_size, result.type(), 
+            GpuMat result_roi(result_roi_size, result.type(),
                              (void*)(result.ptr<float>(y) + x), result.step);
-            GpuMat result_block(result_roi_size, result_data.type(), 
+            GpuMat result_block(result_roi_size, result_data.type(),
                                result_data.ptr(), result_data.step);

            if (stream)
@ -1421,83 +1417,6 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 #endif
 }

-//////////////////////////////////////////////////////////////////////////////
-// pyrDown
-
-namespace cv { namespace gpu { namespace device 
-{
-    namespace imgproc 
-    {
-        template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
-{
-    using namespace ::cv::gpu::device::imgproc;
-
-    typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-    static const func_t funcs[6][4] = 
-    {
-        {pyrDown_gpu<uchar, 1>, pyrDown_gpu<uchar, 2>, pyrDown_gpu<uchar, 3>, pyrDown_gpu<uchar, 4>},
-        {pyrDown_gpu<schar, 1>, pyrDown_gpu<schar, 2>, pyrDown_gpu<schar, 3>, pyrDown_gpu<schar, 4>},
-        {pyrDown_gpu<ushort, 1>, pyrDown_gpu<ushort, 2>, pyrDown_gpu<ushort, 3>, pyrDown_gpu<ushort, 4>},
-        {pyrDown_gpu<short, 1>, pyrDown_gpu<short, 2>, pyrDown_gpu<short, 3>, pyrDown_gpu<short, 4>},
-        {pyrDown_gpu<int, 1>, pyrDown_gpu<int, 2>, pyrDown_gpu<int, 3>, pyrDown_gpu<int, 4>},
-        {pyrDown_gpu<float, 1>, pyrDown_gpu<float, 2>, pyrDown_gpu<float, 3>, pyrDown_gpu<float, 4>},
-    };
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-
-    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
-
-    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
-
-    funcs[src.depth()][src.channels() - 1](src, dst, gpuBorderType, StreamAccessor::getStream(stream));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-// pyrUp
-
-namespace cv { namespace gpu { namespace device 
-{
-    namespace imgproc 
-    {
-        template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
-{
-    using namespace ::cv::gpu::device::imgproc;
-
-    typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-
-    static const func_t funcs[6][4] = 
-    {
-        {pyrUp_gpu<uchar, 1>, pyrUp_gpu<uchar, 2>, pyrUp_gpu<uchar, 3>, pyrUp_gpu<uchar, 4>},
-        {pyrUp_gpu<schar, 1>, pyrUp_gpu<schar, 2>, pyrUp_gpu<schar, 3>, pyrUp_gpu<schar, 4>},
-        {pyrUp_gpu<ushort, 1>, pyrUp_gpu<ushort, 2>, pyrUp_gpu<ushort, 3>, pyrUp_gpu<ushort, 4>},
-        {pyrUp_gpu<short, 1>, pyrUp_gpu<short, 2>, pyrUp_gpu<short, 3>, pyrUp_gpu<short, 4>},
-        {pyrUp_gpu<int, 1>, pyrUp_gpu<int, 2>, pyrUp_gpu<int, 3>, pyrUp_gpu<int, 4>},
-        {pyrUp_gpu<float, 1>, pyrUp_gpu<float, 2>, pyrUp_gpu<float, 3>, pyrUp_gpu<float, 4>},
-    };
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-
-    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
-
-    dst.create(src.rows*2, src.cols*2, src.type());
-
-    funcs[src.depth()][src.channels() - 1](src, dst, gpuBorderType, StreamAccessor::getStream(stream));
-}
-

 //////////////////////////////////////////////////////////////////////////////
 // Canny
@ -1544,9 +1463,9 @@ void cv::gpu::CannyBuf::release()
    trackBuf2.release();
 }

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace canny 
+    namespace canny
    {
        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);

@ -1554,7 +1473,7 @@ namespace cv { namespace gpu { namespace device
        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);

        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
-        
+
        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);

        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
@ -1570,11 +1489,11 @@ namespace
        using namespace ::cv::gpu::device::canny;

        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
-        
+
        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
-        
+
        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
-        
+
        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
    }
 }
@ -1597,7 +1516,7 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th

    dst.create(src.size(), CV_8U);
    dst.setTo(Scalar::all(0));
-    
+
    buf.create(src.size(), apperture_size);
    buf.edgeBuf.setTo(Scalar::all(0));

@ -1636,7 +1555,7 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d

    dst.create(dx.size(), CV_8U);
    dst.setTo(Scalar::all(0));
-    
+
    buf.dx = dx; buf.dy = dy;
    buf.create(dx.size(), -1);
    buf.edgeBuf.setTo(Scalar::all(0));
@ -1646,129 +1565,6 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d
    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }

-
-//////////////////////////////////////////////////////////////////////////////
-// ImagePyramid
-
-namespace cv { namespace gpu { namespace device 
-{
-    namespace pyramid
-    {
-        template <typename T> void kernelDownsampleX2_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-        template <typename T> void kernelInterpolateFrom1_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stream)
-{
-    using namespace cv::gpu::device::pyramid;
-
-    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-
-    static const func_t funcs[7][4] =
-    {
-        {kernelDownsampleX2_gpu<uchar1>, /*kernelDownsampleX2_gpu<uchar2>*/ 0, kernelDownsampleX2_gpu<uchar3>, kernelDownsampleX2_gpu<uchar4>},
-        {/*kernelDownsampleX2_gpu<char1>*/0, /*kernelDownsampleX2_gpu<char2>*/ 0, /*kernelDownsampleX2_gpu<char3>*/ 0, /*kernelDownsampleX2_gpu<char4>*/ 0},
-        {kernelDownsampleX2_gpu<ushort1>, /*kernelDownsampleX2_gpu<ushort2>*/ 0, kernelDownsampleX2_gpu<ushort3>, kernelDownsampleX2_gpu<ushort4>},
-        {/*kernelDownsampleX2_gpu<short1>*/ 0, /*kernelDownsampleX2_gpu<short2>*/ 0, /*kernelDownsampleX2_gpu<short3>*/ 0, /*kernelDownsampleX2_gpu<short4>*/ 0},
-        {/*kernelDownsampleX2_gpu<int1>*/ 0, /*kernelDownsampleX2_gpu<int2>*/ 0, /*kernelDownsampleX2_gpu<int3>*/ 0, /*kernelDownsampleX2_gpu<int4>*/ 0},
-        {kernelDownsampleX2_gpu<float1>, /*kernelDownsampleX2_gpu<float2>*/ 0, kernelDownsampleX2_gpu<float3>, kernelDownsampleX2_gpu<float4>},
-        {/*kernelDownsampleX2_gpu<double1>*/ 0, /*kernelDownsampleX2_gpu<double2>*/ 0, /*kernelDownsampleX2_gpu<double3>*/ 0, /*kernelDownsampleX2_gpu<double4>*/ 0}
-    };
-
-    CV_Assert(img.channels() == 1 || img.channels() == 3 || img.channels() == 4);
-    CV_Assert(img.depth() == CV_8U || img.depth() == CV_16U || img.depth() == CV_32F);
-
-    layer0_ = img;
-    Size szLastLayer = img.size();
-    nLayers_ = 1;
-
-    if (numLayers <= 0)
-        numLayers = 255; //it will cut-off when any of the dimensions goes 1
-
-    pyramid_.resize(numLayers);
-
-    for (int i = 0; i < numLayers - 1; ++i)
-    {
-        Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
-
-        if (szCurLayer.width == 0 || szCurLayer.height == 0)
-            break;
-
-        ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
-        nLayers_++;
-
-        const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
-
-        func_t func = funcs[img.depth()][img.channels() - 1];
-        CV_Assert(func != 0);
-
-        func(prevLayer, pyramid_[i], StreamAccessor::getStream(stream));
-
-        szLastLayer = szCurLayer;
-    }
-}
-
-void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
-{
-    using namespace cv::gpu::device::pyramid;
-
-    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
-
-    static const func_t funcs[7][4] =
-    {
-        {kernelInterpolateFrom1_gpu<uchar1>, /*kernelInterpolateFrom1_gpu<uchar2>*/ 0, kernelInterpolateFrom1_gpu<uchar3>, kernelInterpolateFrom1_gpu<uchar4>},
-        {/*kernelInterpolateFrom1_gpu<char1>*/0, /*kernelInterpolateFrom1_gpu<char2>*/ 0, /*kernelInterpolateFrom1_gpu<char3>*/ 0, /*kernelInterpolateFrom1_gpu<char4>*/ 0},
-        {kernelInterpolateFrom1_gpu<ushort1>, /*kernelInterpolateFrom1_gpu<ushort2>*/ 0, kernelInterpolateFrom1_gpu<ushort3>, kernelInterpolateFrom1_gpu<ushort4>},
-        {/*kernelInterpolateFrom1_gpu<short1>*/ 0, /*kernelInterpolateFrom1_gpu<short2>*/ 0, /*kernelInterpolateFrom1_gpu<short3>*/ 0, /*kernelInterpolateFrom1_gpu<short4>*/ 0},
-        {/*kernelInterpolateFrom1_gpu<int1>*/ 0, /*kernelInterpolateFrom1_gpu<int2>*/ 0, /*kernelInterpolateFrom1_gpu<int3>*/ 0, /*kernelInterpolateFrom1_gpu<int4>*/ 0},
-        {kernelInterpolateFrom1_gpu<float1>, /*kernelInterpolateFrom1_gpu<float2>*/ 0, kernelInterpolateFrom1_gpu<float3>, kernelInterpolateFrom1_gpu<float4>},
-        {/*kernelInterpolateFrom1_gpu<double1>*/ 0, /*kernelInterpolateFrom1_gpu<double2>*/ 0, /*kernelInterpolateFrom1_gpu<double3>*/ 0, /*kernelInterpolateFrom1_gpu<double4>*/ 0}
-    };
-
-    CV_Assert(outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0);
-
-    ensureSizeIsEnough(outRoi, layer0_.type(), outImg);
-
-    if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
-    {
-        if (stream)
-            stream.enqueueCopy(layer0_, outImg);
-        else
-            layer0_.copyTo(outImg);
-    }
-
-    float lastScale = 1.0f;
-    float curScale;
-    GpuMat lastLayer = layer0_;
-    GpuMat curLayer;
-
-    for (int i = 0; i < nLayers_ - 1; ++i)
-    {
-        curScale = lastScale * 0.5f;
-        curLayer = pyramid_[i];
-
-        if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
-        {
-            if (stream)
-                stream.enqueueCopy(curLayer, outImg);
-            else
-                curLayer.copyTo(outImg);
-        }
-
-        if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
-            break;
-
-        lastScale = curScale;
-        lastLayer = curLayer;
-    }
-
-    func_t func = funcs[outImg.depth()][outImg.channels() - 1];
-    CV_Assert(func != 0);
-
-    func(lastLayer, outImg, StreamAccessor::getStream(stream));
-}
-
 #endif /* !defined (HAVE_CUDA) */


--- a/modules/gpu/src/pyramids.cpp
+++ b/modules/gpu/src/pyramids.cpp
@ -0,0 +1,249 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifndef HAVE_CUDA
+// Build without HAVE_CUDA: each public entry point of this file is a stub whose body only calls throw_nogpu().
+void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::ImagePyramid::build(const GpuMat&, int, Stream&) { throw_nogpu(); }
+void cv::gpu::ImagePyramid::getLayer(GpuMat&, Size, Stream&) const { throw_nogpu(); }
+
+#else // HAVE_CUDA
+
+//////////////////////////////////////////////////////////////////////////////
+// pyrDown: host-side dispatcher that selects a pyrDown_gpu<T> instantiation from the source depth and channel count.
+// pyrDown_gpu<T> is only declared here; its definition is not part of this file.
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        template <typename T> void pyrDown_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    }
+}}}
+// Allocates dst as ((src.rows + 1) / 2) x ((src.cols + 1) / 2) with src's type, then calls the selected wrapper with src, dst and the stream.
+void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::device::imgproc;
+    // Signature shared by every pyrDown_gpu<T> instantiation stored in the table below.
+    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    // Dispatch table indexed [src.depth()][src.channels() - 1]; a 0 entry marks a combination with no instantiation.
+    static const func_t funcs[6][4] =
+    {
+        {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
+        {0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
+        {pyrDown_gpu<ushort>     , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3>     , pyrDown_gpu<ushort4>     },
+        {pyrDown_gpu<short>      , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3>      , pyrDown_gpu<short4>      },
+        {0 /*pyrDown_gpu<int>*/  , 0 /*pyrDown_gpu<int2>*/   , 0 /*pyrDown_gpu<int3>*/  , 0 /*pyrDown_gpu<int4>*/  },
+        {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
+    };
+    // Keep both indices inside the 6 x 4 table before it is read.
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    // A 0 entry cannot be called, so unsupported depth/channel combinations are rejected here.
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+    CV_Assert(func != 0);
+    // Half-size destination; the + 1 rounds odd dimensions up.
+    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
+    // StreamAccessor::getStream supplies the cudaStream_t argument required by func_t.
+    func(src, dst, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// pyrUp: host-side dispatcher that selects a pyrUp_gpu<T> instantiation from the source depth and channel count.
+// pyrUp_gpu<T> is only declared here; its definition is not part of this file.
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        template <typename T> void pyrUp_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    }
+}}}
+// Allocates dst as (src.rows * 2) x (src.cols * 2) with src's type, then calls the selected wrapper with src, dst and the stream.
+void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::device::imgproc;
+    // Signature shared by every pyrUp_gpu<T> instantiation stored in the table below.
+    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    // Dispatch table indexed [src.depth()][src.channels() - 1]; a 0 entry marks a combination with no instantiation.
+    static const func_t funcs[6][4] =
+    {
+        {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
+        {0 /*pyrUp_gpu<schar>*/, 0 /*pyrUp_gpu<schar2>*/ , 0 /*pyrUp_gpu<schar3>*/, 0 /*pyrUp_gpu<schar4>*/},
+        {pyrUp_gpu<ushort>     , 0 /*pyrUp_gpu<ushort2>*/, pyrUp_gpu<ushort3>     , pyrUp_gpu<ushort4>     },
+        {pyrUp_gpu<short>      , 0 /*pyrUp_gpu<short2>*/ , pyrUp_gpu<short3>      , pyrUp_gpu<short4>      },
+        {0 /*pyrUp_gpu<int>*/  , 0 /*pyrUp_gpu<int2>*/   , 0 /*pyrUp_gpu<int3>*/  , 0 /*pyrUp_gpu<int4>*/  },
+        {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
+    };
+    // Keep both indices inside the 6 x 4 table before it is read.
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    // A 0 entry cannot be called, so unsupported depth/channel combinations are rejected here.
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+    CV_Assert(func != 0);
+    // Destination has exactly twice the source rows and columns.
+    dst.create(src.rows * 2, src.cols * 2, src.type());
+    // StreamAccessor::getStream supplies the cudaStream_t argument required by func_t.
+    func(src, dst, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// ImagePyramid: forward declarations of the wrappers called by ImagePyramid::build (downsample) and ImagePyramid::getLayer (interpolate).
+// Both templates are only declared here; their definitions are not part of this file.
+namespace cv { namespace gpu { namespace device
+{
+    namespace pyramid
+    {
+        template <typename T> void kernelDownsampleX2_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+        template <typename T> void kernelInterpolateFrom1_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stream)
+{
+    using namespace cv::gpu::device::pyramid;
+    // Stores img as layer0_, then fills pyramid_[i] with successively halved layers produced by kernelDownsampleX2_gpu.
+    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    // Dispatch table indexed [img.depth()][img.channels() - 1]; a 0 entry marks a combination with no instantiation.
+    static const func_t funcs[6][4] =
+    {
+        {kernelDownsampleX2_gpu<uchar1>       , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3>      , kernelDownsampleX2_gpu<uchar4>      },
+        {0 /*kernelDownsampleX2_gpu<char1>*/  , 0 /*kernelDownsampleX2_gpu<char2>*/  , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
+        {kernelDownsampleX2_gpu<ushort1>      , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3>     , kernelDownsampleX2_gpu<ushort4>     },
+        {0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
+        {0 /*kernelDownsampleX2_gpu<int1>*/   , 0 /*kernelDownsampleX2_gpu<int2>*/   , 0 /*kernelDownsampleX2_gpu<int3>*/  , 0 /*kernelDownsampleX2_gpu<int4>*/  },
+        {kernelDownsampleX2_gpu<float1>       , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3>      , kernelDownsampleX2_gpu<float4>      }
+    };
+    // Keep both indices inside the 6 x 4 table before it is read.
+    CV_Assert(img.depth() <= CV_32F && img.channels() <= 4);
+    // Resolved once: every layer is created with img.type(), so the same wrapper serves the whole loop.
+    const func_t func = funcs[img.depth()][img.channels() - 1];
+    CV_Assert(func != 0);
+    // NOTE(review): layer0_ = img presumably shares img's buffer rather than copying it; confirm GpuMat assignment semantics.
+    layer0_ = img;
+    Size szLastLayer = img.size();
+    nLayers_ = 1;
+    // A non-positive numLayers means "as many layers as the image size allows".
+    if (numLayers <= 0)
+        numLayers = 255; // upper bound only: the loop below breaks once a halved dimension reaches 0
+
+    pyramid_.resize(numLayers); // NOTE(review): the loop below fills at most numLayers - 1 of these entries
+
+    for (int i = 0; i < numLayers - 1; ++i)
+    {
+        Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
+        // Integer halving: stop as soon as either dimension would become 0.
+        if (szCurLayer.width == 0 || szCurLayer.height == 0)
+            break;
+
+        ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
+        nLayers_++; // counts layer0_ plus every pyramid_ entry filled so far
+        // Layer i is downsampled from the layer directly above it: layer0_ for i == 0, pyramid_[i - 1] otherwise.
+        const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
+
+        func(prevLayer, pyramid_[i], StreamAccessor::getStream(stream));
+
+        szLastLayer = szCurLayer;
+    }
+}
+
+void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
+{
+    using namespace cv::gpu::device::pyramid;
+    // Fills outImg (sized to outRoi, typed like layer0_) from the stored layers: copies an exactly matching layer, then calls kernelInterpolateFrom1_gpu from the layer selected by the loop.
+    typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
+    // Dispatch table indexed [outImg.depth()][outImg.channels() - 1]; a 0 entry marks a combination with no instantiation.
+    static const func_t funcs[6][4] =
+    {
+        {kernelInterpolateFrom1_gpu<uchar1>      , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3>      , kernelInterpolateFrom1_gpu<uchar4>      },
+        {0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/  , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
+        {kernelInterpolateFrom1_gpu<ushort1>     , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3>     , kernelInterpolateFrom1_gpu<ushort4>     },
+        {0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
+        {0 /*kernelInterpolateFrom1_gpu<int1>*/  , 0 /*kernelInterpolateFrom1_gpu<int2>*/   , 0 /*kernelInterpolateFrom1_gpu<int3>*/  , 0 /*kernelInterpolateFrom1_gpu<int4>*/  },
+        {kernelInterpolateFrom1_gpu<float1>      , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3>      , kernelInterpolateFrom1_gpu<float4>      }
+    };
+    // outRoi must be non-empty and no larger than layer0_ in either dimension.
+    CV_Assert(outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0);
+
+    ensureSizeIsEnough(outRoi, layer0_.type(), outImg);
+    // NOTE(review): no depth/channel bounds assert before this table lookup (build() has one); presumably safe because outImg takes layer0_'s type, confirm.
+    const func_t func = funcs[outImg.depth()][outImg.channels() - 1];
+    CV_Assert(func != 0);
+    // Exact match with layer 0: enqueue the copy on the stream when `stream` tests true, otherwise copy directly.
+    if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
+    {
+        if (stream)
+            stream.enqueueCopy(layer0_, outImg);
+        else
+            layer0_.copyTo(outImg);
+    }
+    // NOTE(review): there is no return after the copy above, so the loop and the final func call still execute; confirm this is intended.
+    float lastScale = 1.0f; // NOTE(review): lastScale and curScale only feed each other and never affect the output
+    float curScale;
+    GpuMat lastLayer = layer0_;
+    GpuMat curLayer;
+    // Walk the stored layers in index order; lastLayer tracks the most recent layer that did not trigger the break below.
+    for (int i = 0; i < nLayers_ - 1; ++i)
+    {
+        curScale = lastScale * 0.5f;
+        curLayer = pyramid_[i];
+        // Exact match with this layer: copy it, using the same stream / no-stream choice as for layer 0.
+        if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
+        {
+            if (stream)
+                stream.enqueueCopy(curLayer, outImg);
+            else
+                curLayer.copyTo(outImg);
+        }
+        // Stop at the first layer that outRoi covers in both dimensions (this includes the exact-match case).
+        if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
+            break;
+
+        lastScale = curScale;
+        lastLayer = curLayer;
+    }
+    // Always executed with outImg as destination, including after an exact-match copy (NOTE(review): presumably overwrites that copy, confirm).
+    func(lastLayer, outImg, StreamAccessor::getStream(stream));
+}
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_calib3d.cpp
+++ b/modules/gpu/test/test_calib3d.cpp
@ -54,38 +54,38 @@ struct StereoBlockMatching : TestWithParam<cv::gpu::DeviceInfo>
    cv::Mat img_l;
    cv::Mat img_r;
    cv::Mat img_template;
-    
-    cv::gpu::DeviceInfo devInfo;    
-    
-    virtual void SetUp() 
+
+    cv::gpu::DeviceInfo devInfo;
+
+    virtual void SetUp()
    {
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        img_l = readImage("stereobm/aloe-L.png", CV_LOAD_IMAGE_GRAYSCALE);
        img_r = readImage("stereobm/aloe-R.png", CV_LOAD_IMAGE_GRAYSCALE);
        img_template = readImage("stereobm/aloe-disp.png", CV_LOAD_IMAGE_GRAYSCALE);
-        
+
        ASSERT_FALSE(img_l.empty());
        ASSERT_FALSE(img_r.empty());
        ASSERT_FALSE(img_template.empty());
    }
 };

-TEST_P(StereoBlockMatching, Regression) 
-{    
+TEST_P(StereoBlockMatching, Regression)
+{
    cv::Mat disp;

    cv::gpu::GpuMat dev_disp;
    cv::gpu::StereoBM_GPU bm(0, 128, 19);

    bm(cv::gpu::GpuMat(img_l), cv::gpu::GpuMat(img_r), dev_disp);
-    
+
    dev_disp.download(disp);

    disp.convertTo(disp, img_template.type());
-    
+
    EXPECT_MAT_NEAR(img_template, disp, 0.0);
 }

@ -99,26 +99,26 @@ struct StereoBeliefPropagation : TestWithParam<cv::gpu::DeviceInfo>
    cv::Mat img_l;
    cv::Mat img_r;
    cv::Mat img_template;
-    
-    cv::gpu::DeviceInfo devInfo;  

-    virtual void SetUp() 
+    cv::gpu::DeviceInfo devInfo;
+
+    virtual void SetUp()
    {
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-           
+
        img_l = readImage("stereobp/aloe-L.png");
        img_r = readImage("stereobp/aloe-R.png");
        img_template = readImage("stereobp/aloe-disp.png", CV_LOAD_IMAGE_GRAYSCALE);
-        
+
        ASSERT_FALSE(img_l.empty());
        ASSERT_FALSE(img_r.empty());
        ASSERT_FALSE(img_template.empty());
    }
 };

-TEST_P(StereoBeliefPropagation, Regression) 
+TEST_P(StereoBeliefPropagation, Regression)
 {
    cv::Mat disp;

@ -126,11 +126,11 @@ TEST_P(StereoBeliefPropagation, Regression)
    cv::gpu::StereoBeliefPropagation bpm(64, 8, 2, 25, 0.1f, 15, 1, CV_16S);

    bpm(cv::gpu::GpuMat(img_l), cv::gpu::GpuMat(img_r), dev_disp);
-    
+
    dev_disp.download(disp);

    disp.convertTo(disp, img_template.type());
-    
+
    EXPECT_MAT_NEAR(img_template, disp, 0.0);
 }

@ -144,15 +144,15 @@ struct StereoConstantSpaceBP : TestWithParam<cv::gpu::DeviceInfo>
    cv::Mat img_l;
    cv::Mat img_r;
    cv::Mat img_template;
-    
+
    cv::gpu::DeviceInfo devInfo;

-    virtual void SetUp() 
+    virtual void SetUp()
    {
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        img_l = readImage("csstereobp/aloe-L.png");
        img_r = readImage("csstereobp/aloe-R.png");

@ -160,14 +160,14 @@ struct StereoConstantSpaceBP : TestWithParam<cv::gpu::DeviceInfo>
            img_template = readImage("csstereobp/aloe-disp.png", CV_LOAD_IMAGE_GRAYSCALE);
        else
            img_template = readImage("csstereobp/aloe-disp_CC1X.png", CV_LOAD_IMAGE_GRAYSCALE);
-            
+
        ASSERT_FALSE(img_l.empty());
        ASSERT_FALSE(img_r.empty());
-        ASSERT_FALSE(img_template.empty());        
+        ASSERT_FALSE(img_template.empty());
    }
 };

-TEST_P(StereoConstantSpaceBP, Regression) 
+TEST_P(StereoConstantSpaceBP, Regression)
 {
    cv::Mat disp;

@ -175,11 +175,11 @@ TEST_P(StereoConstantSpaceBP, Regression)
    cv::gpu::StereoConstantSpaceBP bpm(128, 16, 4, 4);

    bpm(cv::gpu::GpuMat(img_l), cv::gpu::GpuMat(img_r), dev_disp);
-    
+
    dev_disp.download(disp);

    disp.convertTo(disp, img_template.type());
-    
+
    EXPECT_MAT_NEAR(img_template, disp, 1.0);
 }

@ -191,12 +191,12 @@ INSTANTIATE_TEST_CASE_P(Calib3D, StereoConstantSpaceBP, ALL_DEVICES);
 struct ProjectPoints : TestWithParam<cv::gpu::DeviceInfo>
 {
    cv::gpu::DeviceInfo devInfo;
-    
+
    cv::Mat src;
    cv::Mat rvec;
    cv::Mat tvec;
    cv::Mat camera_mat;
-    
+
    std::vector<cv::Point2f> dst_gold;

    virtual void SetUp()
@ -220,17 +220,17 @@ struct ProjectPoints : TestWithParam<cv::gpu::DeviceInfo>
    }
 };

-TEST_P(ProjectPoints, Accuracy) 
+TEST_P(ProjectPoints, Accuracy)
 {
    cv::Mat dst;
-   
+
    cv::gpu::GpuMat d_dst;

    cv::gpu::projectPoints(cv::gpu::GpuMat(src), rvec, tvec, camera_mat, cv::Mat(), d_dst);

    d_dst.download(dst);

-    ASSERT_EQ(dst_gold.size(), dst.cols);
+    ASSERT_EQ(dst_gold.size(), static_cast<size_t>(dst.cols));
    ASSERT_EQ(1, dst.rows);
    ASSERT_EQ(CV_32FC2, dst.type());

@ -257,7 +257,7 @@ struct TransformPoints : TestWithParam<cv::gpu::DeviceInfo>
    cv::Mat rvec;
    cv::Mat tvec;
    cv::Mat rot;
-        
+
    virtual void SetUp()
    {
        devInfo = GetParam();
@ -283,7 +283,7 @@ TEST_P(TransformPoints, Accuracy)
    cv::gpu::transformPoints(cv::gpu::GpuMat(src), rvec, tvec, d_dst);

    d_dst.download(dst);
-    
+
    ASSERT_EQ(src.size(), dst.size());
    ASSERT_EQ(src.type(), dst.type());

@ -318,7 +318,7 @@ struct SolvePnPRansac : TestWithParam<cv::gpu::DeviceInfo>

    cv::Mat rvec_gold;
    cv::Mat tvec_gold;
-        
+
    virtual void SetUp()
    {
        devInfo = GetParam();
@ -346,7 +346,7 @@ TEST_P(SolvePnPRansac, Accuracy)
    cv::Mat rvec, tvec;
    std::vector<int> inliers;

-    cv::gpu::solvePnPRansac(object, cv::Mat(1, image_vec.size(), CV_32FC2, &image_vec[0]), camera_mat, 
+    cv::gpu::solvePnPRansac(object, cv::Mat(1, image_vec.size(), CV_32FC2, &image_vec[0]), camera_mat,
                            cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)), rvec, tvec, false, 200, 2.f, 100, &inliers);

    ASSERT_LE(cv::norm(rvec - rvec_gold), 1e-3f);
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@ -90,7 +90,7 @@ struct SURF : TestWithParam<cv::gpu::DeviceInfo>

    std::vector<cv::KeyPoint> keypoints_gold;
    std::vector<float> descriptors_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GetParam();
@ -157,20 +157,20 @@ PARAM_TEST_CASE(BruteForceMatcher, cv::gpu::DeviceInfo, DistType, int)
    cv::gpu::DeviceInfo devInfo;
    cv::gpu::BruteForceMatcher_GPU_base::DistType distType;
    int dim;
-        
+
    int queryDescCount;
    int countFactor;
-    
+
    cv::Mat query, train;

-    virtual void SetUp() 
+    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        distType = (cv::gpu::BruteForceMatcher_GPU_base::DistType)(int)GET_PARAM(1);
        dim = GET_PARAM(2);

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        queryDescCount = 300; // must be even number because we split train data in some cases in two
        countFactor = 4; // do not change it

@ -218,7 +218,7 @@ TEST_P(BruteForceMatcher, Match)

    matcher.match(loadMat(query), loadMat(train), matches);

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
@ -259,7 +259,7 @@ TEST_P(BruteForceMatcher, MatchAdd)

    isMaskSupported = matcher.isMaskSupported();

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
@ -292,7 +292,7 @@ TEST_P(BruteForceMatcher, KnnMatch2)
    cv::gpu::BruteForceMatcher_GPU_base matcher(distType);
    matcher.knnMatch(loadMat(query), loadMat(train), matches, knn);

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
@ -324,7 +324,7 @@ TEST_P(BruteForceMatcher, KnnMatch3)
    cv::gpu::BruteForceMatcher_GPU_base matcher(distType);
    matcher.knnMatch(loadMat(query), loadMat(train), matches, knn);

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
@ -375,7 +375,7 @@ TEST_P(BruteForceMatcher, KnnMatchAdd2)

    isMaskSupported = matcher.isMaskSupported();

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    int shift = isMaskSupported ? 1 : 0;
@ -437,7 +437,7 @@ TEST_P(BruteForceMatcher, KnnMatchAdd3)

    isMaskSupported = matcher.isMaskSupported();

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    int shift = isMaskSupported ? 1 : 0;
@ -485,7 +485,7 @@ TEST_P(BruteForceMatcher, RadiusMatch)

    matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    for (size_t i = 0; i < matches.size(); i++)
@ -536,7 +536,7 @@ TEST_P(BruteForceMatcher, RadiusMatchAdd)

    isMaskSupported = matcher.isMaskSupported();

-    ASSERT_EQ(queryDescCount, matches.size());
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int badCount = 0;
    int shift = isMaskSupported ? 1 : 0;
@ -588,17 +588,16 @@ struct FAST : TestWithParam<cv::gpu::DeviceInfo>
    int threshold;

    std::vector<cv::KeyPoint> keypoints_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        image = readImage("features2d/aloe.png", CV_LOAD_IMAGE_GRAYSCALE);
        ASSERT_FALSE(image.empty());

-        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
        threshold = 30;

        cv::FAST(image, keypoints_gold, threshold);
@ -630,7 +629,7 @@ TEST_P(FAST, Accuracy)
    cv::gpu::FAST_GPU fastGPU(threshold);

    fastGPU(cv::gpu::GpuMat(image), cv::gpu::GpuMat(), keypoints);
-    
+
    ASSERT_EQ(keypoints.size(), keypoints_gold.size());

    std::sort(keypoints.begin(), keypoints.end(), KeyPointCompare());
@ -663,16 +662,16 @@ struct ORB : TestWithParam<cv::gpu::DeviceInfo>

    std::vector<cv::KeyPoint> keypoints_gold;
    cv::Mat descriptors_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        image = readImage("features2d/aloe.png", CV_LOAD_IMAGE_GRAYSCALE);
-        ASSERT_FALSE(image.empty());        
-        
+        ASSERT_FALSE(image.empty());
+
        mask = cv::Mat(image.size(), CV_8UC1, cv::Scalar::all(1));
        mask(cv::Range(0, image.rows / 2), cv::Range(0, image.cols / 2)).setTo(cv::Scalar::all(0));

--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@ -58,7 +58,7 @@ PARAM_TEST_CASE(Integral, cv::gpu::DeviceInfo, UseRoi)
    cv::Mat src;

    cv::Mat dst_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
@ -70,9 +70,9 @@ PARAM_TEST_CASE(Integral, cv::gpu::DeviceInfo, UseRoi)

        size = cv::Size(rng.uniform(20, 150), rng.uniform(20, 150));

-        src = randomMat(rng, size, CV_8UC1, 0.0, 255.0, false); 
+        src = randomMat(rng, size, CV_8UC1, 0.0, 255.0, false);

-        cv::integral(src, dst_gold, CV_32S);     
+        cv::integral(src, dst_gold, CV_32S);
    }
 };

@ -90,7 +90,7 @@ TEST_P(Integral, Accuracy)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Integral, Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        WHOLE_SUBMAT));

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
@ -101,9 +101,9 @@ PARAM_TEST_CASE(CvtColor, cv::gpu::DeviceInfo, MatType, UseRoi)
    cv::gpu::DeviceInfo devInfo;
    int type;
    bool useRoi;
-    
+
    cv::Mat img;
-    
+
    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
@ -111,7 +111,7 @@ PARAM_TEST_CASE(CvtColor, cv::gpu::DeviceInfo, MatType, UseRoi)
        useRoi = GET_PARAM(2);

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        cv::Mat imgBase = readImage("stereobm/aloe-L.png");
        ASSERT_FALSE(imgBase.empty());

@ -1998,7 +1998,7 @@ TEST_P(CvtColor, RGBA2YUV4)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        Values(CV_8U, CV_16U, CV_32F),
                        WHOLE_SUBMAT));

@ -2009,18 +2009,18 @@ PARAM_TEST_CASE(SwapChannels, cv::gpu::DeviceInfo, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    bool useRoi;
-    
+
    cv::Mat img;
-    
+
    cv::Mat dst_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        useRoi = GET_PARAM(1);

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        cv::Mat imgBase = readImage("stereobm/aloe-L.png");
        ASSERT_FALSE(imgBase.empty());

@ -2051,23 +2051,23 @@ INSTANTIATE_TEST_CASE_P(ImgProc, SwapChannels, Combine(ALL_DEVICES, WHOLE_SUBMAT
 struct HistEven : TestWithParam<cv::gpu::DeviceInfo>
 {
    cv::gpu::DeviceInfo devInfo;
-    
+
    cv::Mat hsv;
-    
+
    int hbins;
    float hranges[2];

    cv::Mat hist_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        cv::Mat img = readImage("stereobm/aloe-L.png");
        ASSERT_FALSE(img.empty());
-        
+
        cv::cvtColor(img, hsv, CV_BGR2HSV);

        hbins = 30;
@ -2092,7 +2092,7 @@ struct HistEven : TestWithParam<cv::gpu::DeviceInfo>
 TEST_P(HistEven, Accuracy)
 {
    cv::Mat hist;
-    
+
    std::vector<cv::gpu::GpuMat> srcs;
    cv::gpu::split(loadMat(hsv), srcs);

@ -2114,7 +2114,7 @@ struct CalcHist : TestWithParam<cv::gpu::DeviceInfo>
    cv::Size size;
    cv::Mat src;
    cv::Mat hist_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GetParam();
@ -2124,7 +2124,7 @@ struct CalcHist : TestWithParam<cv::gpu::DeviceInfo>
        cv::RNG& rng = TS::ptr()->get_rng();

        size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
-        
+
        src = randomMat(rng, size, CV_8UC1, 0, 255, false);

        hist_gold.create(1, 256, CV_32SC1);
@ -2144,7 +2144,7 @@ struct CalcHist : TestWithParam<cv::gpu::DeviceInfo>
 TEST_P(CalcHist, Accuracy)
 {
    cv::Mat hist;
-    
+
    cv::gpu::GpuMat gpuHist;

    cv::gpu::calcHist(loadMat(src), gpuHist);
@ -2163,7 +2163,7 @@ struct EqualizeHist : TestWithParam<cv::gpu::DeviceInfo>
    cv::Size size;
    cv::Mat src;
    cv::Mat dst_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GetParam();
@ -2173,7 +2173,7 @@ struct EqualizeHist : TestWithParam<cv::gpu::DeviceInfo>
        cv::RNG& rng = TS::ptr()->get_rng();

        size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
-        
+
        src = randomMat(rng, size, CV_8UC1, 0, 255, false);

        cv::equalizeHist(src, dst_gold);
@ -2183,7 +2183,7 @@ struct EqualizeHist : TestWithParam<cv::gpu::DeviceInfo>
 TEST_P(EqualizeHist, Accuracy)
 {
    cv::Mat dst;
-    
+
    cv::gpu::GpuMat gpuDst;

    cv::gpu::equalizeHist(loadMat(src), gpuDst);
@ -2217,7 +2217,7 @@ PARAM_TEST_CASE(CornerHarris, cv::gpu::DeviceInfo, MatType, Border, int, int)
        type = GET_PARAM(1);
        borderType = GET_PARAM(2);
        blockSize = GET_PARAM(3);
-        apertureSize = GET_PARAM(4); 
+        apertureSize = GET_PARAM(4);

        cv::gpu::setDevice(devInfo.deviceID());

@ -2248,8 +2248,8 @@ TEST_P(CornerHarris, Accuracy)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CornerHarris, Combine(
-                        ALL_DEVICES, 
-                        Values(CV_8UC1, CV_32FC1), 
+                        ALL_DEVICES,
+                        Values(CV_8UC1, CV_32FC1),
                        Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT),
                        Values(3, 5, 7),
                        Values(0, 3, 5, 7)));
@ -2268,19 +2268,17 @@ PARAM_TEST_CASE(CornerMinEigen, cv::gpu::DeviceInfo, MatType, Border, int, int)
    cv::Mat src;

    cv::Mat dst_gold;
-    
+
    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        type = GET_PARAM(1);
        borderType = GET_PARAM(2);
        blockSize = GET_PARAM(3);
-        apertureSize = GET_PARAM(4); 
+        apertureSize = GET_PARAM(4);

        cv::gpu::setDevice(devInfo.deviceID());

-        cv::RNG& rng = TS::ptr()->get_rng();
-
        cv::Mat img = readImage("stereobm/aloe-L.png", CV_LOAD_IMAGE_GRAYSCALE);
        ASSERT_FALSE(img.empty());

@ -2304,8 +2302,8 @@ TEST_P(CornerMinEigen, Accuracy)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CornerMinEigen, Combine(
-                        ALL_DEVICES, 
-                        Values(CV_8UC1, CV_32FC1), 
+                        ALL_DEVICES,
+                        Values(CV_8UC1, CV_32FC1),
                        Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT),
                        Values(3, 5, 7),
                        Values(0, 3, 5, 7)));
@ -2325,7 +2323,7 @@ struct ColumnSum : TestWithParam<cv::gpu::DeviceInfo>
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-    
+
        cv::RNG& rng = TS::ptr()->get_rng();

        size = cv::Size(rng.uniform(100, 400), rng.uniform(100, 400));
@ -2337,7 +2335,7 @@ struct ColumnSum : TestWithParam<cv::gpu::DeviceInfo>
 TEST_P(ColumnSum, Accuracy)
 {
    cv::Mat dst;
-    
+
    cv::gpu::GpuMat dev_dst;

    cv::gpu::columnSum(loadMat(src), dev_dst);
@ -2387,7 +2385,7 @@ PARAM_TEST_CASE(Norm, cv::gpu::DeviceInfo, MatType, NormCode, UseRoi)
        useRoi = GET_PARAM(3);

        cv::gpu::setDevice(devInfo.deviceID());
-    
+
        cv::RNG& rng = TS::ptr()->get_rng();

        size = cv::Size(rng.uniform(100, 400), rng.uniform(100, 400));
@ -2406,7 +2404,7 @@ TEST_P(Norm, Accuracy)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Norm, Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        TYPES(CV_8U, CV_32F, 1, 1),
                        Values((int) cv::NORM_INF, (int) cv::NORM_L1, (int) cv::NORM_L2),
                        WHOLE_SUBMAT));
@ -2431,7 +2429,7 @@ PARAM_TEST_CASE(ReprojectImageTo3D, cv::gpu::DeviceInfo, UseRoi)
        useRoi = GET_PARAM(1);

        cv::gpu::setDevice(devInfo.deviceID());
-    
+
        cv::RNG& rng = TS::ptr()->get_rng();

        size = cv::Size(rng.uniform(100, 500), rng.uniform(100, 500));
@ -2481,7 +2479,7 @@ INSTANTIATE_TEST_CASE_P(ImgProc, ReprojectImageTo3D, Combine(ALL_DEVICES, WHOLE_
 struct MeanShift : TestWithParam<cv::gpu::DeviceInfo>
 {
    cv::gpu::DeviceInfo devInfo;
-    
+
    cv::Mat rgba;

    int spatialRad;
@ -2492,10 +2490,10 @@ struct MeanShift : TestWithParam<cv::gpu::DeviceInfo>
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        cv::Mat img = readImage("meanshift/cones.png");
        ASSERT_FALSE(img.empty());
-        
+
        cv::cvtColor(img, rgba, CV_BGR2BGRA);

        spatialRad = 30;
@ -2506,7 +2504,7 @@ struct MeanShift : TestWithParam<cv::gpu::DeviceInfo>
 TEST_P(MeanShift, Filtering)
 {
    cv::Mat img_template;
-    
+
    if (supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
        img_template = readImage("meanshift/con_result.png");
    else
@ -2562,8 +2560,8 @@ TEST_P(MeanShift, Proc)
    d_spmap.download(spmap);

    ASSERT_EQ(CV_8UC4, rmap.type());
-    
-    EXPECT_MAT_NEAR(rmap_filtered, rmap, 0.0);    
+
+    EXPECT_MAT_NEAR(rmap_filtered, rmap, 0.0);
    EXPECT_MAT_NEAR(spmap_template, spmap, 0.0);
 }

@ -2573,7 +2571,7 @@ PARAM_TEST_CASE(MeanShiftSegmentation, cv::gpu::DeviceInfo, int)
 {
    cv::gpu::DeviceInfo devInfo;
    int minsize;
-    
+
    cv::Mat rgba;

    cv::Mat dst_gold;
@ -2584,10 +2582,10 @@ PARAM_TEST_CASE(MeanShiftSegmentation, cv::gpu::DeviceInfo, int)
        minsize = GET_PARAM(1);

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        cv::Mat img = readImage("meanshift/cones.png");
        ASSERT_FALSE(img.empty());
-        
+
        cv::cvtColor(img, rgba, CV_BGR2BGRA);

        std::ostringstream path;
@ -2669,7 +2667,7 @@ TEST_P(MatchTemplate8U, Regression)

 INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate8U, Combine(
                        ALL_DEVICES,
-                        Range(1, 5), 
+                        Range(1, 5),
                        Values((int)cv::TM_SQDIFF, (int) cv::TM_SQDIFF_NORMED, (int) cv::TM_CCORR, (int) cv::TM_CCORR_NORMED, (int) cv::TM_CCOEFF, (int) cv::TM_CCOEFF_NORMED)));


@ -2720,8 +2718,8 @@ TEST_P(MatchTemplate32F, Regression)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate32F, Combine(
-                        ALL_DEVICES, 
-                        Range(1, 5), 
+                        ALL_DEVICES,
+                        Range(1, 5),
                        Values((int) cv::TM_SQDIFF, (int) cv::TM_CCORR)));


@ -2830,9 +2828,9 @@ PARAM_TEST_CASE(MulSpectrums, cv::gpu::DeviceInfo, DftFlags)
    cv::gpu::DeviceInfo devInfo;
    int flag;

-    cv::Mat a, b; 
+    cv::Mat a, b;

-    virtual void SetUp() 
+    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        flag = GET_PARAM(1);
@ -2850,7 +2848,7 @@ TEST_P(MulSpectrums, Simple)
 {
    cv::Mat c_gold;
    cv::mulSpectrums(a, b, c_gold, flag, false);
-    
+
    cv::Mat c;

    cv::gpu::GpuMat d_c;
@ -2882,7 +2880,7 @@ TEST_P(MulSpectrums, Scaled)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, MulSpectrums, Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        Values(0, (int) cv::DFT_ROWS)));

 ////////////////////////////////////////////////////////////////////////////
@ -2892,7 +2890,7 @@ struct Dft : TestWithParam<cv::gpu::DeviceInfo>
 {
    cv::gpu::DeviceInfo devInfo;

-    virtual void SetUp() 
+    virtual void SetUp()
    {
        devInfo = GetParam();

@ -2956,7 +2954,7 @@ TEST_P(Dft, C2C)
 void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
 {
    SCOPED_TRACE(hint);
-    
+
    cv::RNG& rng = TS::ptr()->get_rng();

    cv::Mat a = randomMat(rng, cv::Size(cols, rows), CV_32FC1, 0.0, 10.0, false);
@ -2981,7 +2979,7 @@ void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)

    cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), 0);
    cv::gpu::dft(d_b, d_c, cv::Size(cols, rows), cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
-    
+
    EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
    EXPECT_TRUE(!inplace || d_c.ptr() == d_c_data.ptr());
    ASSERT_EQ(CV_32F, d_c.depth());
@ -3019,7 +3017,7 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Dft, ALL_DEVICES);
 ////////////////////////////////////////////////////////////////////////////
 // blend

-template <typename T> 
+template <typename T>
 void blendLinearGold(const cv::Mat& img1, const cv::Mat& img2, const cv::Mat& weights1, const cv::Mat& weights2, cv::Mat& result_gold)
 {
    result_gold.create(img1.size(), img1.type());
@ -3057,7 +3055,7 @@ PARAM_TEST_CASE(Blend, cv::gpu::DeviceInfo, MatType, UseRoi)

    cv::Mat result_gold;

-    virtual void SetUp() 
+    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        type = GET_PARAM(1);
@ -3075,7 +3073,7 @@ PARAM_TEST_CASE(Blend, cv::gpu::DeviceInfo, MatType, UseRoi)
        img2 = randomMat(rng, size, type, 0.0, depth == CV_8U ? 255.0 : 1.0, false);
        weights1 = randomMat(rng, size, CV_32F, 0, 1, false);
        weights2 = randomMat(rng, size, CV_32F, 0, 1, false);
-        
+
        if (depth == CV_8U)
            blendLinearGold<uchar>(img1, img2, weights1, weights2, result_gold);
        else
@ -3101,105 +3099,6 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Blend, Combine(
                        testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
                        WHOLE_SUBMAT));

-////////////////////////////////////////////////////////
-// pyrDown
-
-PARAM_TEST_CASE(PyrDown, cv::gpu::DeviceInfo, MatType, UseRoi)
-{    
-    cv::gpu::DeviceInfo devInfo;
-    int type;
-    bool useRoi;
-
-    cv::Mat src;
-
-    cv::Mat dst_gold;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-
-        cv::RNG& rng = TS::ptr()->get_rng();
-
-        cv::Size size(rng.uniform(100, 200), rng.uniform(100, 200));
-
-        src = randomMat(rng, size, type, 0.0, 255.0, false);
-        
-        cv::pyrDown(src, dst_gold);
-    }
-};
-
-TEST_P(PyrDown, Accuracy)
-{    
-    cv::Mat dst;
-
-    cv::gpu::GpuMat d_dst;
-    
-    cv::gpu::pyrDown(loadMat(src, useRoi), d_dst);
-    
-    d_dst.download(dst);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, PyrDown, Combine(
-                        ALL_DEVICES, 
-                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                        WHOLE_SUBMAT));
-
-////////////////////////////////////////////////////////
-// pyrUp
-
-PARAM_TEST_CASE(PyrUp, cv::gpu::DeviceInfo, MatType, UseRoi)
-{    
-    cv::gpu::DeviceInfo devInfo;
-    int type;
-    bool useRoi;
-
-    cv::Mat src;
-    
-    cv::Mat dst_gold;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-        
-        cv::RNG& rng = TS::ptr()->get_rng();
-
-        cv::Size size(rng.uniform(200, 400), rng.uniform(200, 400));
-
-        src = randomMat(rng, size, type, 0.0, 255.0, false);
-        
-        cv::pyrUp(src, dst_gold);
-    }
-};
-
-TEST_P(PyrUp, Accuracy)
-{    
-    cv::Mat dst;
-
-    cv::gpu::GpuMat d_dst;
-    
-    cv::gpu::pyrUp(loadMat(src, useRoi), d_dst, cv::BORDER_REFLECT);
-    
-    d_dst.download(dst);
-
-    // results differs only on border left and top border due different border extrapolation type
-    EXPECT_MAT_NEAR(dst_gold(cv::Range(1, dst_gold.rows), cv::Range(1, dst_gold.cols)), dst(cv::Range(1, dst_gold.rows), cv::Range(1, dst_gold.cols)), src.depth() == CV_32F ? 1e-4 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, Combine(
-                        ALL_DEVICES, 
-                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                        WHOLE_SUBMAT));
-
 ////////////////////////////////////////////////////////
 // Canny

@ -3209,7 +3108,7 @@ PARAM_TEST_CASE(Canny, cv::gpu::DeviceInfo, int, bool, UseRoi)
    int apperture_size;
    bool L2gradient;
    bool useRoi;
-    
+
    cv::Mat img;

    double low_thresh;
@ -3217,7 +3116,7 @@ PARAM_TEST_CASE(Canny, cv::gpu::DeviceInfo, int, bool, UseRoi)

    cv::Mat edges_gold;

-    virtual void SetUp() 
+    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        apperture_size = GET_PARAM(1);
@ -3225,13 +3124,13 @@ PARAM_TEST_CASE(Canny, cv::gpu::DeviceInfo, int, bool, UseRoi)
        useRoi = GET_PARAM(3);

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        img = readImage("stereobm/aloe-L.png", CV_LOAD_IMAGE_GRAYSCALE);
-        ASSERT_FALSE(img.empty()); 
+        ASSERT_FALSE(img.empty());

        low_thresh = 50.0;
        high_thresh = 100.0;
-        
+
        cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, L2gradient);
    }
 };
@ -3301,14 +3200,14 @@ namespace
 }

 PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, int, bool)
-{    
+{
    cv::gpu::DeviceInfo devInfo;
    int ksize;
    bool ccorr;
-    
+
    cv::Mat src;
    cv::Mat kernel;
-    
+
    cv::Mat dst_gold;

    virtual void SetUp()
@ -3318,14 +3217,14 @@ PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, int, bool)
        ccorr = GET_PARAM(2);

        cv::gpu::setDevice(devInfo.deviceID());
-        
+
        cv::RNG& rng = TS::ptr()->get_rng();

        cv::Size size(rng.uniform(200, 400), rng.uniform(200, 400));

        src = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false);
        kernel = randomMat(rng, cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0, false);
-        
+
        convolveDFT(src, kernel, dst_gold, ccorr);
    }
 };
@ -3345,7 +3244,7 @@ TEST_P(Convolve, Accuracy)


 INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        Values(3, 7, 11, 17, 19, 23, 45),
                        Bool()));

--- a/modules/gpu/test/test_pyramids.cpp
+++ b/modules/gpu/test/test_pyramids.cpp
@ -0,0 +1,126 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+////////////////////////////////////////////////////////
+// pyrDown
+
+PARAM_TEST_CASE(PyrDown, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) // fixture for gpu::pyrDown, parameterized by device, source size, matrix type and ROI flag
+{
+    cv::gpu::DeviceInfo devInfo; // device this test instance runs on
+    cv::Size size;               // size of the source image
+    int type;                    // OpenCV matrix type of source and destination
+    bool useRoi;                 // forwarded to loadMat/createMat; presumably selects a sub-matrix view - confirm in utility.hpp
+
+    virtual void SetUp() // unpacks the test parameters and selects the GPU before each test body
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // make the parameterized device current
+    }
+};
+
+TEST_P(PyrDown, Accuracy) // compares gpu::pyrDown against the CPU cv::pyrDown on the same input
+{
+    cv::Mat src = randomMat(size, type); // host-side input produced by the randomMat test helper
+
+    cv::gpu::GpuMat dst = createMat(cv::Size((size.width + 1) / 2, (size.height + 1) / 2), type, useRoi); // destination pre-created at half the source size, rounded up
+    cv::gpu::pyrDown(loadMat(src, useRoi), dst); // GPU result under test
+
+    cv::Mat dst_gold;
+    cv::pyrDown(src, dst_gold); // CPU reference result
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0); // tolerance: 1e-4 for 32F input, 1.0 for the other depths
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine( // every combination of the four generators below becomes one test instance
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    WHOLE_SUBMAT));
+
+////////////////////////////////////////////////////////
+// pyrUp
+
+PARAM_TEST_CASE(PyrUp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) // fixture for gpu::pyrUp, parameterized by device, source size, matrix type and ROI flag
+{
+    cv::gpu::DeviceInfo devInfo; // device this test instance runs on
+    cv::Size size;               // size of the source image
+    int type;                    // OpenCV matrix type of source and destination
+    bool useRoi;                 // forwarded to loadMat/createMat; presumably selects a sub-matrix view - confirm in utility.hpp
+
+    virtual void SetUp() // unpacks the test parameters and selects the GPU before each test body
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // make the parameterized device current
+    }
+};
+
+TEST_P(PyrUp, Accuracy) // compares gpu::pyrUp against the CPU cv::pyrUp over the whole image, borders included
+{
+    cv::Mat src = randomMat(size, type); // host-side input produced by the randomMat test helper
+
+    cv::gpu::GpuMat dst = createMat(cv::Size(size.width * 2, size.height * 2), type, useRoi); // destination pre-created at twice the source size
+    cv::gpu::pyrUp(loadMat(src, useRoi), dst); // GPU result under test
+
+    cv::Mat dst_gold;
+    cv::pyrUp(src, dst_gold); // CPU reference result
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0); // tolerance: 1e-4 for 32F input, 1.0 for the other depths
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine( // every combination of the four generators below becomes one test instance
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/utility.hpp
+++ b/modules/gpu/test/utility.hpp
@ -88,7 +88,7 @@ double checkSimilarity(const cv::Mat& m1, const cv::Mat& m2);
        EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
    }

-namespace cv { namespace gpu 
+namespace cv { namespace gpu
 {
    void PrintTo(const DeviceInfo& info, std::ostream* os);
 }}
@ -167,6 +167,8 @@ CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX

 #define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))

+#define WHOLE testing::Values(UseRoi(false))
+#define SUBMAT testing::Values(UseRoi(true))
 #define WHOLE_SUBMAT testing::Values(UseRoi(false), UseRoi(true))

 #define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))