Merge pull request #761 from jet47:gpu-core-refactoring

Authored by Andrey Kamaev, committed by OpenCV Buildbot
commit 2cd67cc92b
  1. cmake/OpenCVModule.cmake (16)
  2. modules/core/CMakeLists.txt (12)
  3. modules/core/include/opencv2/core/cuda/block.hpp (4)
  4. modules/core/include/opencv2/core/cuda/border_interpolate.hpp (4)
  5. modules/core/include/opencv2/core/cuda/color.hpp (4)
  6. modules/core/include/opencv2/core/cuda/common.hpp (60)
  7. modules/core/include/opencv2/core/cuda/datamov_utils.hpp (4)
  8. modules/core/include/opencv2/core/cuda/detail/color_detail.hpp (70)
  9. modules/core/include/opencv2/core/cuda/detail/reduce.hpp (2)
  10. modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp (2)
  11. modules/core/include/opencv2/core/cuda/detail/transform_detail.hpp (8)
  12. modules/core/include/opencv2/core/cuda/detail/type_traits_detail.hpp (4)
  13. modules/core/include/opencv2/core/cuda/detail/vec_distance_detail.hpp (4)
  14. modules/core/include/opencv2/core/cuda/dynamic_smem.hpp (2)
  15. modules/core/include/opencv2/core/cuda/emulation.hpp (4)
  16. modules/core/include/opencv2/core/cuda/filters.hpp (4)
  17. modules/core/include/opencv2/core/cuda/funcattrib.hpp (4)
  18. modules/core/include/opencv2/core/cuda/functional.hpp (4)
  19. modules/core/include/opencv2/core/cuda/limits.hpp (4)
  20. modules/core/include/opencv2/core/cuda/reduce.hpp (2)
  21. modules/core/include/opencv2/core/cuda/saturate_cast.hpp (2)
  22. modules/core/include/opencv2/core/cuda/scan.hpp (14)
  23. modules/core/include/opencv2/core/cuda/simd_functions.hpp (2)
  24. modules/core/include/opencv2/core/cuda/transform.hpp (2)
  25. modules/core/include/opencv2/core/cuda/type_traits.hpp (2)
  26. modules/core/include/opencv2/core/cuda/utility.hpp (4)
  27. modules/core/include/opencv2/core/cuda/vec_distance.hpp (4)
  28. modules/core/include/opencv2/core/cuda/vec_math.hpp (4)
  29. modules/core/include/opencv2/core/cuda/vec_traits.hpp (4)
  30. modules/core/include/opencv2/core/cuda/warp.hpp (4)
  31. modules/core/include/opencv2/core/cuda/warp_reduce.hpp (4)
  32. modules/core/include/opencv2/core/cuda/warp_shuffle.hpp (2)
  33. modules/core/include/opencv2/core/cuda_devptrs.hpp (21)
  34. modules/core/include/opencv2/core/gpu_private.hpp (143)
  35. modules/core/include/opencv2/core/gpumat.hpp (5)
  36. modules/core/include/opencv2/core/stream_accessor.hpp (15)
  37. modules/core/src/cuda/matrix_operations.cu (20)
  38. modules/core/src/cudastream.cpp (41)
  39. modules/core/src/gpumat.cpp (281)
  40. modules/core/src/matrix_operations.cpp (38)
  41. modules/core/src/opengl_interop.cpp (100)
  42. modules/core/src/precomp.hpp (33)
  43. modules/gpu/CMakeLists.txt (6)
  44. modules/gpu/include/opencv2/gpu/device/static_check.hpp (67)
  45. modules/gpu/perf/perf_precomp.hpp (6)
  46. modules/gpu/src/arithm.cpp (32)
  47. modules/gpu/src/bgfg_gmg.cpp (12)
  48. modules/gpu/src/bgfg_mog.cpp (28)
  49. modules/gpu/src/bilateral_filter.cpp (10)
  50. modules/gpu/src/blend.cpp (6)
  51. modules/gpu/src/brute_force_matcher.cpp (76)
  52. modules/gpu/src/calib3d.cpp (10)
  53. modules/gpu/src/cascadeclassifier.cpp (24)
  54. modules/gpu/src/color.cpp (170)
  55. modules/gpu/src/cu_safe_call.cpp (137)
  56. modules/gpu/src/cu_safe_call.h (67)
  57. modules/gpu/src/cuda/NV12ToARGB.cu (4)
  58. modules/gpu/src/cuda/bf_knnmatch.cu (20)
  59. modules/gpu/src/cuda/bf_match.cu (18)
  60. modules/gpu/src/cuda/bf_radius_match.cu (14)
  61. modules/gpu/src/cuda/bgfg_gmg.cu (8)
  62. modules/gpu/src/cuda/bgfg_mog.cu (10)
  63. modules/gpu/src/cuda/bilateral_filter.cu (12)
  64. modules/gpu/src/cuda/blend.cu (6)
  65. modules/gpu/src/cuda/calib3d.cu (16)
  66. modules/gpu/src/cuda/canny.cu (16)
  67. modules/gpu/src/cuda/ccomponetns.cu (10)
  68. modules/gpu/src/cuda/clahe.cu (14)
  69. modules/gpu/src/cuda/color.cu (12)
  70. modules/gpu/src/cuda/column_filter.h (10)
  71. modules/gpu/src/cuda/copy_make_border.cu (8)
  72. modules/gpu/src/cuda/debayer.cu (16)
  73. modules/gpu/src/cuda/disp_bilateral_filter.cu (10)
  74. modules/gpu/src/cuda/element_operations.cu (182)
  75. modules/gpu/src/cuda/fast.cu (6)
  76. modules/gpu/src/cuda/fgd_bgfg.cu (14)
  77. modules/gpu/src/cuda/gftt.cu (24)
  78. modules/gpu/src/cuda/global_motion.cu (4)
  79. modules/gpu/src/cuda/hist.cu (14)
  80. modules/gpu/src/cuda/hog.cu (14)
  81. modules/gpu/src/cuda/hough.cu (14)
  82. modules/gpu/src/cuda/imgproc.cu (14)
  83. modules/gpu/src/cuda/integral_image.cu (4)
  84. modules/gpu/src/cuda/internal_shared.hpp (29)
  85. modules/gpu/src/cuda/lbp.cu (6)
  86. modules/gpu/src/cuda/lbp.hpp (6)
  87. modules/gpu/src/cuda/match_template.cu (8)
  88. modules/gpu/src/cuda/mathfunc.cu (6)
  89. modules/gpu/src/cuda/matrix_reductions.cu (42)
  90. modules/gpu/src/cuda/nlm.cu (24)
  91. modules/gpu/src/cuda/optflowbm.cu (10)
  92. modules/gpu/src/cuda/optical_flow.cu (4)
  93. modules/gpu/src/cuda/optical_flow_farneback.cu (8)
  94. modules/gpu/src/cuda/orb.cu (8)
  95. modules/gpu/src/cuda/pyr_down.cu (14)
  96. modules/gpu/src/cuda/pyr_up.cu (14)
  97. modules/gpu/src/cuda/pyrlk.cu (14)
  98. modules/gpu/src/cuda/remap.cu (18)
  99. modules/gpu/src/cuda/resize.cu (20)
  100. modules/gpu/src/cuda/rgb_to_yv12.cu (6)
  Some files were not shown because too many files have changed in this diff.

@ -432,16 +432,16 @@ macro(ocv_glob_module_sources)
file(GLOB lib_hdrs "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
file(GLOB lib_device_srcs "src/cuda/*.cu")
set(device_objs "")
set(lib_device_hdrs "")
file(GLOB lib_cuda_srcs "src/cuda/*.cu")
set(cuda_objs "")
set(lib_cuda_hdrs "")
if (HAVE_CUDA AND lib_device_srcs)
if(HAVE_CUDA AND lib_cuda_srcs)
ocv_include_directories(${CUDA_INCLUDE_DIRS})
file(GLOB lib_device_hdrs "src/cuda/*.hpp")
file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
ocv_cuda_compile(device_objs ${lib_device_srcs} ${lib_device_hdrs})
source_group("Src\\Cuda" FILES ${lib_device_srcs} ${lib_device_hdrs})
ocv_cuda_compile(cuda_objs ${lib_cuda_srcs} ${lib_cuda_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda_srcs} ${lib_cuda_hdrs})
endif()
file(GLOB cl_kernels "src/opencl/*.cl")
@ -457,7 +457,7 @@ macro(ocv_glob_module_sources)
endif()
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
SOURCES ${lib_srcs} ${lib_int_hdrs} ${device_objs} ${lib_device_srcs} ${lib_device_hdrs})
SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_srcs} ${lib_cuda_hdrs})
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Include" FILES ${lib_hdrs})

@ -1,12 +1,20 @@
set(the_description "The Core Functionality")
ocv_add_module(core ${ZLIB_LIBRARIES})
ocv_module_include_directories(${ZLIB_INCLUDE_DIR} "${OpenCV_SOURCE_DIR}/modules/gpu/include")
ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
if(HAVE_CUDA)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
endif()
ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc")
file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
ocv_create_module()
ocv_add_precompiled_headers(${the_module})

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DEVICE_BLOCK_HPP__
#define __OPENCV_GPU_DEVICE_BLOCK_HPP__
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
struct Block
{
@ -201,5 +201,3 @@ namespace cv { namespace gpu { namespace device
}}}
#endif /* __OPENCV_GPU_DEVICE_BLOCK_HPP__ */

@ -47,7 +47,7 @@
#include "vec_traits.hpp"
#include "vec_math.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
//////////////////////////////////////////////////////////////
// BrdConstant
@ -709,6 +709,6 @@ namespace cv { namespace gpu { namespace device
const int width;
const D val;
};
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__

@ -45,7 +45,7 @@
#include "detail/color_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implement
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
@ -296,6 +296,6 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
#undef OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__

@ -45,10 +45,8 @@
#include <cuda_runtime.h>
#include "opencv2/core/cuda_devptrs.hpp"
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif
#include "opencv2/core/cvdef.h"
#include "opencv2/core/base.hpp"
#ifndef CV_PI_F
#ifndef CV_PI
@ -58,16 +56,24 @@
#endif
#endif
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
namespace cv { namespace gpu {
static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func)
{
if (cudaSuccess != err)
cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
}
}}
#ifndef cudaSafeCall
#if defined(__GNUC__)
#define cudaSafeCall(expr) cv::gpu::checkCudaError(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) cv::gpu::checkCudaError(expr, __FILE__, __LINE__, "")
#endif
#endif
namespace cv { namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func);
template <typename T> static inline bool isAligned(const T* ptr, size_t size)
{
return reinterpret_cast<size_t>(ptr) % size == 0;
@ -79,37 +85,29 @@ namespace cv { namespace gpu
}
}}
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
enum
{
return (total + grain - 1) / grain;
}
namespace device
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU,
BORDER_REFLECT_GPU,
BORDER_WRAP_GPU
};
namespace cudev
{
using cv::gpu::divUp;
#ifdef __CUDACC__
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
#if defined (_WIN32) || defined (__APPLE__)
typedef unsigned int uint;
#endif
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
#endif // __CUDACC__
}
}}
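
For orientation, here is a minimal standalone sketch (not OpenCV code) of the error-checking and grid-sizing pattern the refactored common.hpp establishes: a checkCudaError helper wrapped by the cudaSafeCall macro, plus divUp for rounding up grid dimensions. The names mirror the diff above; the main() driver is illustrative only.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Report a failing CUDA runtime call together with its source location, as common.hpp does.
static inline void checkCudaError(cudaError_t err, const char* file, int line, const char* func)
{
    if (err != cudaSuccess)
    {
        std::fprintf(stderr, "CUDA error: %s at %s:%d (%s)\n",
                     cudaGetErrorString(err), file, line, func);
        std::exit(EXIT_FAILURE);
    }
}

#if defined(__GNUC__)
#  define cudaSafeCall(expr) checkCudaError((expr), __FILE__, __LINE__, __func__)
#else
#  define cudaSafeCall(expr) checkCudaError((expr), __FILE__, __LINE__, "")
#endif

// Rounding-up division used to size CUDA grids, same as divUp in the hunk above.
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
    return (total + grain - 1) / grain;
}

int main()
{
    void* buf = 0;
    cudaSafeCall(cudaMalloc(&buf, 1024));   // aborts with location info on failure
    std::printf("blocks for 1000 items at 256 threads/block: %d\n", divUp(1000, 256));
    cudaSafeCall(cudaFree(buf));
    return 0;
}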

@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
@ -100,6 +100,6 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_ASM_PTR
#endif // __CUDA_ARCH__ >= 200
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__

@ -49,7 +49,7 @@
#include "../limits.hpp"
#include "../functional.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
#ifndef CV_DESCALE
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
@ -149,7 +149,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \
struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -303,7 +303,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \
struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -350,7 +350,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::Gray2RGB<T, dcn> functor_type; \
typedef ::cv::gpu::cudev::color_detail::Gray2RGB<T, dcn> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -392,7 +392,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \
struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::Gray2RGB5x5<green_bits> functor_type; \
typedef ::cv::gpu::cudev::color_detail::Gray2RGB5x5<green_bits> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -434,7 +434,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \
struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB5x52Gray<green_bits> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB5x52Gray<green_bits> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -486,7 +486,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2Gray<T, scn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2Gray<T, scn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -539,7 +539,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -629,7 +629,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -710,7 +710,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -791,7 +791,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -869,7 +869,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -946,7 +946,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1086,7 +1086,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1094,7 +1094,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <typename T> struct name ## _full_traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1102,7 +1102,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1110,7 +1110,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _full_traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1228,7 +1228,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1236,7 +1236,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <typename T> struct name ## _full_traits \
{ \
typedef ::cv::gpu::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1244,7 +1244,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1252,7 +1252,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _full_traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1363,7 +1363,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1371,7 +1371,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <typename T> struct name ## _full_traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1379,7 +1379,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1387,7 +1387,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _full_traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1505,7 +1505,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1513,7 +1513,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <typename T> struct name ## _full_traits \
{ \
typedef ::cv::gpu::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1521,7 +1521,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1529,7 +1529,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <> struct name ## _full_traits<float> \
{ \
typedef ::cv::gpu::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
typedef ::cv::gpu::cudev::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1674,7 +1674,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(name, scn, dcn, srgb, blueIdx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2Lab<T, scn, dcn, srgb, blueIdx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2Lab<T, scn, dcn, srgb, blueIdx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1787,7 +1787,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::Lab2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::Lab2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1886,7 +1886,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(name, scn, dcn, srgb, blueIdx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::RGB2Luv<T, scn, dcn, srgb, blueIdx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::RGB2Luv<T, scn, dcn, srgb, blueIdx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1987,7 +1987,7 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
template <typename T> struct name ## _traits \
{ \
typedef ::cv::gpu::device::color_detail::Luv2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
typedef ::cv::gpu::cudev::color_detail::Luv2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
static __host__ __device__ __forceinline__ functor_type create_functor() \
{ \
return functor_type(); \
@ -1996,6 +1996,6 @@ namespace cv { namespace gpu { namespace device
#undef CV_DESCALE
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_COLOR_DETAIL_HPP__

@ -47,7 +47,7 @@
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace reduce_detail
{

@ -47,7 +47,7 @@
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace reduce_key_val_detail
{

@ -47,7 +47,7 @@
#include "../vec_traits.hpp"
#include "../functional.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace transform_detail
{
@ -345,7 +345,7 @@ namespace cv { namespace gpu { namespace device
{
typedef TransformFunctorTraits<UnOp> ft;
StaticAssert<ft::smart_shift != 1>::check();
CV_StaticAssert(ft::smart_shift != 1, "");
if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
@ -369,7 +369,7 @@ namespace cv { namespace gpu { namespace device
{
typedef TransformFunctorTraits<BinOp> ft;
StaticAssert<ft::smart_shift != 1>::check();
CV_StaticAssert(ft::smart_shift != 1, "");
if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
!isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
@ -390,6 +390,6 @@ namespace cv { namespace gpu { namespace device
}
};
} // namespace transform_detail
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__

@ -46,7 +46,7 @@
#include "../common.hpp"
#include "../vec_traits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace type_traits_detail
{
@ -182,6 +182,6 @@ namespace cv { namespace gpu { namespace device
enum { value = 1 };
};
} // namespace type_traits_detail
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__

@ -45,7 +45,7 @@
#include "../datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace vec_distance_detail
{
@ -112,6 +112,6 @@ namespace cv { namespace gpu { namespace device
}
};
} // namespace vec_distance_detail
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template<class T> struct DynamicSharedMem
{

@ -45,7 +45,7 @@
#include "warp_reduce.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
struct Emulation
{
@ -133,6 +133,6 @@ namespace cv { namespace gpu { namespace device
}
};
};
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* OPENCV_GPU_EMULATION_HPP_ */

@ -48,7 +48,7 @@
#include "vec_math.hpp"
#include "type_traits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename Ptr2D> struct PointFilter
{
@ -273,6 +273,6 @@ namespace cv { namespace gpu { namespace device
float scale_x, scale_y;
int width, haight;
};
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_FILTERS_HPP__

@ -45,7 +45,7 @@
#include <cstdio>
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template<class Func>
void printFuncAttrib(Func& func)
@ -66,6 +66,6 @@ namespace cv { namespace gpu { namespace device
printf("\n");
fflush(stdout);
}
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */

@ -49,7 +49,7 @@
#include "type_traits.hpp"
#include "device_functions.h"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
// Function Objects
template<typename Argument, typename Result> struct unary_function : public std::unary_function<Argument, Result> {};
@ -786,6 +786,6 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(type) \
template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_FUNCTIONAL_HPP__

@ -46,7 +46,7 @@
#include <limits>
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template<class T> struct numeric_limits
{
@ -230,6 +230,6 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true;
};
}}} // namespace cv { namespace gpu { namespace device {
}}} // namespace cv { namespace gpu { namespace cudev {
#endif // __OPENCV_GPU_LIMITS_GPU_HPP__

@ -47,7 +47,7 @@
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <int N, typename T, class Op>
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)

@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }

@ -43,12 +43,12 @@
#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/warp.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
@ -174,13 +174,13 @@ namespace cv { namespace gpu { namespace device
__device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
{
#if __CUDA_ARCH__ >= 300
const unsigned int laneId = cv::gpu::device::Warp::laneId();
const unsigned int laneId = cv::gpu::cudev::Warp::laneId();
// scan based on shuffle functions
#pragma unroll
for (int i = 1; i <= (OPENCV_GPU_WARP_SIZE / 2); i *= 2)
{
const T n = cv::gpu::device::shfl_up(idata, i);
const T n = cv::gpu::cudev::shfl_up(idata, i);
if (laneId >= i)
idata += n;
}
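
The loop above is the classic warp-inclusive (Hillis-Steele) scan built on warp shuffles. A self-contained device sketch of the same idea, written against the modern __shfl_up_sync intrinsic rather than OpenCV's shfl_up wrapper, and assuming blocks are launched with a multiple of 32 threads:

#include <cuda_runtime.h>

// Inclusive prefix sum across one warp using shuffles (no shared memory needed).
__device__ __forceinline__ int warpScanInclusive(int val)
{
    const int lane = threadIdx.x & 31;                      // lane id within the warp
    #pragma unroll
    for (int offset = 1; offset <= 16; offset *= 2)
    {
        const int n = __shfl_up_sync(0xffffffffu, val, offset);
        if (lane >= offset)
            val += n;                                       // add the value from 'offset' lanes below
    }
    return val;
}

__global__ void scanWarps(const int* in, int* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int v = (i < n) ? in[i] : 0;                      // pad the last partial warp with zeros
    const int s = warpScanInclusive(v);                     // every lane participates in the shuffle
    if (i < n)
        out[i] = s;                                         // each warp scans its own 32-element segment
}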

@ -123,7 +123,7 @@
vmin4(a,b) per-byte unsigned minimum: min(a, b)
*/
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
// 2

@ -47,7 +47,7 @@
#include "utility.hpp"
#include "detail/transform_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename D, typename UnOp, typename Mask>
static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)

@ -45,7 +45,7 @@
#include "detail/type_traits_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct IsSimpleParameter
{

@ -46,7 +46,7 @@
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
@ -208,6 +208,6 @@ namespace cv { namespace gpu { namespace device
return false;
}
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_UTILITY_HPP__

@ -47,7 +47,7 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct L1Dist
{
@ -219,6 +219,6 @@ namespace cv { namespace gpu { namespace device
U vec1Vals[MAX_LEN / THREAD_DIM];
};
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__

@ -47,7 +47,7 @@
#include "vec_traits.hpp"
#include "functional.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace vec_math_detail
{
@ -325,6 +325,6 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_VECMATH_HPP__

@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template<typename T, int N> struct TypeVec;
@ -275,6 +275,6 @@ namespace cv { namespace gpu { namespace device
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif // __OPENCV_GPU_VEC_TRAITS_HPP__

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
struct Warp
{
@ -126,6 +126,6 @@ namespace cv { namespace gpu { namespace device
*t = value;
}
};
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */

@ -43,7 +43,7 @@
#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <class T>
__device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
@ -63,6 +63,6 @@ namespace cv { namespace gpu { namespace device
return ptr[tid - lane];
}
}}} // namespace cv { namespace gpu { namespace device {
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)

@ -58,9 +58,6 @@ namespace cv
// Simple lightweight structures that encapsulate information about an image on the device.
// They are intended to be passed to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};
template<typename T> struct DevPtr
{
typedef T elem_type;
@ -148,24 +145,6 @@ namespace cv
typedef DevMem2Db DevMem2D;
typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di;
//#undef __CV_GPU_DEPR_BEFORE__
//#undef __CV_GPU_DEPR_AFTER__
namespace device
{
using cv::gpu::PtrSz;
using cv::gpu::PtrStep;
using cv::gpu::PtrStepSz;
using cv::gpu::PtrStepSzb;
using cv::gpu::PtrStepSzf;
using cv::gpu::PtrStepSzi;
using cv::gpu::PtrStepb;
using cv::gpu::PtrStepf;
using cv::gpu::PtrStepi;
}
}
}

@ -0,0 +1,143 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_GPU_PRIVATE_HPP__
#define __OPENCV_CORE_GPU_PRIVATE_HPP__
#ifndef __OPENCV_BUILD
# error this is a private header which should not be used from outside of the OpenCV library
#endif
#include "cvconfig.h"
#include "opencv2/core/cvdef.h"
#include "opencv2/core/base.hpp"
#ifdef HAVE_CUDA
# include <cuda.h>
# include <cuda_runtime.h>
# include <npp.h>
# include "opencv2/core/stream_accessor.hpp"
# include "opencv2/core/cuda/common.hpp"
# define CUDART_MINIMUM_REQUIRED_VERSION 4020
# if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
# error "Insufficient Cuda Runtime library version, please update it."
# endif
# if defined(CUDA_ARCH_BIN_OR_PTX_10)
# error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
# endif
#endif
namespace cv { namespace gpu {
CV_EXPORTS cv::String getNppErrorMessage(int code);
CV_EXPORTS cv::String getCudaDriverApiErrorMessage(int code);
// Converts CPU border extrapolation mode into GPU internal analogue.
// Returns true if the GPU analogue exists, false otherwise.
CV_EXPORTS bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
}}
#ifndef HAVE_CUDA
static inline void throw_no_cuda() { CV_Error(cv::Error::GpuNotSupported, "The library is compiled without GPU support"); }
#else // HAVE_CUDA
static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform"); }
namespace cv { namespace gpu
{
static inline void checkNppError(int code, const char* file, const int line, const char* func)
{
if (code < 0)
cv::error(cv::Error::GpuApiCallError, getNppErrorMessage(code), func, file, line);
}
static inline void checkCudaDriverApiError(int code, const char* file, const int line, const char* func)
{
if (code != CUDA_SUCCESS)
cv::error(cv::Error::GpuApiCallError, getCudaDriverApiErrorMessage(code), func, file, line);
}
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_8S> { typedef Npp8s npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
class NppStreamHandler
{
public:
inline explicit NppStreamHandler(cudaStream_t newStream)
{
oldStream = nppGetStream();
nppSetStream(newStream);
}
inline ~NppStreamHandler()
{
nppSetStream(oldStream);
}
private:
cudaStream_t oldStream;
};
}}
#if defined(__GNUC__)
#define nppSafeCall(expr) cv::gpu::checkNppError(expr, __FILE__, __LINE__, __func__)
#define cuSafeCall(expr) cv::gpu::checkCudaDriverApiError(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define nppSafeCall(expr) cv::gpu::checkNppError(expr, __FILE__, __LINE__, "")
#define cuSafeCall(expr) cv::gpu::checkCudaDriverApiError(expr, __FILE__, __LINE__, "")
#endif
#endif // HAVE_CUDA
#endif // __OPENCV_CORE_GPU_PRIVATE_HPP__
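
The NppStreamHandler added above is a small RAII guard: it records the NPP library's current stream, switches NPP to the caller's stream, and restores the previous one when the guard leaves scope. A hedged usage sketch of the same pattern (the surrounding function and the call site are placeholders, not part of this patch):

#include <npp.h>
#include <cuda_runtime.h>

// Same shape as cv::gpu::NppStreamHandler from gpu_private.hpp.
class NppStreamGuard
{
public:
    explicit NppStreamGuard(cudaStream_t newStream)
    {
        oldStream_ = nppGetStream();   // remember the stream NPP was using
        nppSetStream(newStream);       // route subsequent NPP calls to our stream
    }
    ~NppStreamGuard()
    {
        nppSetStream(oldStream_);      // restore on scope exit
    }
private:
    cudaStream_t oldStream_;
};

// Hypothetical caller: run NPP primitives on a user-supplied stream.
void runNppWorkOnStream(cudaStream_t stream)
{
    NppStreamGuard guard(stream);
    // ... nppi*/npps* calls issued here execute on 'stream' ...
}   // the previous NPP stream is restored here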

@ -454,11 +454,6 @@ CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat);
////////////////////////////////////////////////////////////////////////
// Error handling
CV_EXPORTS void error(const char* error_string, const char* file, const int line, const char* func = "");
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////

@ -43,17 +43,20 @@
#ifndef __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
#define __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
#include "opencv2/core/gpumat.hpp"
#include "cuda_runtime_api.h"
#include <cuda_runtime.h>
#include "opencv2/core/cvdef.h"
// This is the only header file that depends on CUDA. All other headers are independent.
// So if you use OpenCV binaries you do not need to install the CUDA Toolkit.
// But if you want to use the GPU yourself, you may get a CUDA stream instance using the class below.
// In this case you have to install the CUDA Toolkit.
namespace cv
{
namespace gpu
{
// This is the only header file that depends on CUDA. All other headers are independent.
// So if you use OpenCV binaries you do not need to install the CUDA Toolkit.
// But if you want to use the GPU yourself, you may get a CUDA stream instance using the class below.
// In this case you have to install the CUDA Toolkit.
class Stream;
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
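
As the comment says, StreamAccessor is the one bridge between cv::gpu::Stream and the raw CUDA runtime. A brief sketch of the intended use, assuming the headers shown in this diff and an 8-bit single-channel GpuMat; the kernel itself is a placeholder:

#include "opencv2/core/gpumat.hpp"
#include "opencv2/core/stream_accessor.hpp"
#include <cuda_runtime.h>

// Placeholder kernel: the point is only where the stream comes from.
__global__ void touchPixels(unsigned char* data, size_t step, int rows, int cols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
        data[y * step + x] = 255;   // step is in bytes; valid indexing for CV_8UC1
}

void launchOnOpenCVStream(cv::gpu::GpuMat& img, cv::gpu::Stream& s)
{
    // Pull the underlying cudaStream_t so custom kernels share OpenCV's stream.
    cudaStream_t raw = cv::gpu::StreamAccessor::getStream(s);
    const dim3 block(32, 8);
    const dim3 grid((img.cols + block.x - 1) / block.x,
                    (img.rows + block.y - 1) / block.y);
    touchPixels<<<grid, block, 0, raw>>>(img.data, img.step, img.rows, img.cols);
}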

@ -40,12 +40,12 @@
//
//M*/
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/type_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/type_traits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
void writeScalar(const uchar*);
void writeScalar(const schar*);
@ -58,7 +58,7 @@ namespace cv { namespace gpu { namespace device
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
@ -76,9 +76,9 @@ namespace cv { namespace gpu { namespace device
template <typename T> void copyToWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
{
if (colorMask)
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMask(mask), stream);
cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMask(mask), stream);
else
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
}
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
@ -293,7 +293,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
}
#if defined __clang__
@ -379,4 +379,4 @@ namespace cv { namespace gpu { namespace device
#if defined __clang__
# pragma clang diagnostic pop
#endif
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
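
In the real code copyToWithMask goes through the cudev::transform framework with an identity functor and a mask predicate (SingleMask or SingleMaskChannels); the net effect on the device is a masked element-wise copy over pitched 2D buffers. A standalone sketch of that effect, not the actual implementation:

#include <cuda_runtime.h>

// Copy src(y,x) into dst(y,x) wherever mask(y,x) != 0; all steps are in bytes.
template <typename T>
__global__ void copyWithMaskKernel(const T* src, size_t srcStep,
                                   T* dst, size_t dstStep,
                                   const unsigned char* mask, size_t maskStep,
                                   int rows, int cols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= cols || y >= rows)
        return;

    const T* srcRow = (const T*)((const char*)src + y * srcStep);
    T*       dstRow = (T*)((char*)dst + y * dstStep);

    if (mask[y * maskStep + x])
        dstRow[x] = srcRow[x];
}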

@ -46,33 +46,30 @@ using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA)
#define throw_nogpu() CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
cv::gpu::Stream::Stream() { throw_nogpu(); }
cv::gpu::Stream::Stream() { throw_no_cuda(); }
cv::gpu::Stream::~Stream() {}
cv::gpu::Stream::Stream(const Stream&) { throw_nogpu(); }
Stream& cv::gpu::Stream::operator=(const Stream&) { throw_nogpu(); return *this; }
bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return false; }
void cv::gpu::Stream::waitForCompletion() { throw_nogpu(); }
void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_nogpu(); }
void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_nogpu(); }
void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_nogpu(); }
void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_nogpu(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_nogpu(); }
void cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_nogpu(); }
Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; }
cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
cv::gpu::Stream::Stream(Impl*) { throw_nogpu(); }
void cv::gpu::Stream::create() { throw_nogpu(); }
void cv::gpu::Stream::release() { throw_nogpu(); }
cv::gpu::Stream::Stream(const Stream&) { throw_no_cuda(); }
Stream& cv::gpu::Stream::operator=(const Stream&) { throw_no_cuda(); return *this; }
bool cv::gpu::Stream::queryIfComplete() { throw_no_cuda(); return false; }
void cv::gpu::Stream::waitForCompletion() { throw_no_cuda(); }
void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_no_cuda(); }
void cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_no_cuda(); }
Stream& cv::gpu::Stream::Null() { throw_no_cuda(); static Stream s; return s; }
cv::gpu::Stream::operator bool() const { throw_no_cuda(); return false; }
cv::gpu::Stream::Stream(Impl*) { throw_no_cuda(); }
void cv::gpu::Stream::create() { throw_no_cuda(); }
void cv::gpu::Stream::release() { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
#include "opencv2/core/stream_accessor.hpp"
namespace cv { namespace gpu
{
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);

@ -45,64 +45,38 @@
using namespace cv;
using namespace cv::gpu;
#ifndef HAVE_CUDA
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
#else // HAVE_CUDA
namespace
{
#if defined(__GNUC__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#endif
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
{
String msg = cv::format("NPP API Call Error: %d", err);
cv::gpu::error(msg.c_str(), file, line, func);
}
}
}
#endif // HAVE_CUDA
//////////////////////////////// Initialization & Info ////////////////////////
#ifndef HAVE_CUDA
int cv::gpu::getCudaEnabledDeviceCount() { return 0; }
void cv::gpu::setDevice(int) { throw_nogpu; }
int cv::gpu::getDevice() { throw_nogpu; return 0; }
void cv::gpu::setDevice(int) { throw_no_cuda(); }
int cv::gpu::getDevice() { throw_no_cuda(); return 0; }
void cv::gpu::resetDevice() { throw_nogpu; }
void cv::gpu::resetDevice() { throw_no_cuda(); }
bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; }
bool cv::gpu::deviceSupports(FeatureSet) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; }
bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::has(int, int) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::hasBin(int, int) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_no_cuda(); return false; }
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_no_cuda(); return false; }
size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; }
void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; }
size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; }
size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; }
bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; }
bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; }
void cv::gpu::DeviceInfo::query() { throw_nogpu; }
size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_no_cuda(); return 0; }
void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_no_cuda(); }
size_t cv::gpu::DeviceInfo::freeMemory() const { throw_no_cuda(); return 0; }
size_t cv::gpu::DeviceInfo::totalMemory() const { throw_no_cuda(); return 0; }
bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_no_cuda(); return false; }
bool cv::gpu::DeviceInfo::isCompatible() const { throw_no_cuda(); return false; }
void cv::gpu::DeviceInfo::query() { throw_no_cuda(); }
void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; }
void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; }
void cv::gpu::printCudaDeviceInfo(int) { throw_no_cuda(); }
void cv::gpu::printShortCudaDeviceInfo(int) { throw_no_cuda(); }
#else // HAVE_CUDA
@ -846,18 +820,18 @@ namespace
class EmptyFuncTable : public GpuFuncTable
{
public:
void copy(const Mat&, GpuMat&) const { throw_nogpu; }
void copy(const GpuMat&, Mat&) const { throw_nogpu; }
void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
void copy(const Mat&, GpuMat&) const { throw_no_cuda(); }
void copy(const GpuMat&, Mat&) const { throw_no_cuda(); }
void copy(const GpuMat&, GpuMat&) const { throw_no_cuda(); }
void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_no_cuda(); }
void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; }
void convert(const GpuMat&, GpuMat&) const { throw_no_cuda(); }
void convert(const GpuMat&, GpuMat&, double, double) const { throw_no_cuda(); }
void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; }
void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_no_cuda(); }
void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
void mallocPitch(void**, size_t*, size_t, size_t) const { throw_no_cuda(); }
void free(void*) const {}
};
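A self-contained, simplified sketch of the function-table idea behind EmptyFuncTable above: every entry keeps its signature but throws in a no-CUDA build, while free() stays a harmless no-op. FuncTable and the Demo* names below are illustrative stand-ins, not the real GpuFuncTable.

#include <cstddef>
#include <stdexcept>

static void throw_no_cuda() { throw std::runtime_error("the library is compiled without CUDA support"); }

// Simplified stand-in for the GpuFuncTable interface used by gpumat.cpp.
class FuncTable
{
public:
    virtual ~FuncTable() {}
    virtual void mallocPitch(void** ptr, std::size_t* pitch, std::size_t width, std::size_t height) const = 0;
    virtual void free(void* ptr) const = 0;
};

// The table installed when CUDA is absent: allocation throws, deallocation is a no-op.
class DemoEmptyFuncTable : public FuncTable
{
public:
    void mallocPitch(void**, std::size_t*, std::size_t, std::size_t) const { throw_no_cuda(); }
    void free(void*) const {}
};

int main()
{
    DemoEmptyFuncTable table;
    void* ptr = 0;
    std::size_t pitch = 0;
    try { table.mallocPitch(&ptr, &pitch, 640, 480); }
    catch (const std::exception&) { /* expected without CUDA support */ }
    table.free(ptr);
    return 0;
}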
@ -870,7 +844,7 @@ namespace
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
@ -888,13 +862,13 @@ namespace
template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
{
Scalar_<T> sf = s;
cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
cv::gpu::cudev::set_to_gpu(src, sf.val, src.channels(), stream);
}
template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
{
Scalar_<T> sf = s;
cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
cv::gpu::cudev::set_to_gpu(src, sf.val, mask, src.channels(), stream);
}
}
@ -918,17 +892,17 @@ namespace cv { namespace gpu
CV_Assert(src.size() == dst.size() && src.type() == dst.type());
CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
cv::gpu::cudev::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
}
void convertTo(const GpuMat& src, GpuMat& dst)
{
cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
cv::gpu::cudev::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
}
void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
{
cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
cv::gpu::cudev::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
}
void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
@ -1551,18 +1525,185 @@ void cv::gpu::GpuMat::release()
////////////////////////////////////////////////////////////////////////
// Error handling
void cv::gpu::error(const char *error_string, const char *file, const int line, const char *func)
#ifdef HAVE_CUDA
namespace
{
int code = CV_GpuApiCallError;
#define error_entry(entry) { entry, #entry }
struct ErrorEntry
{
int code;
const char* str;
};
if (std::uncaught_exception())
struct ErrorEntryComparer
{
const char* errorStr = cvErrorStr(code);
const char* function = func ? func : "unknown function";
int code;
ErrorEntryComparer(int code_) : code(code_) {}
bool operator()(const ErrorEntry& e) const { return e.code == code; }
};
const ErrorEntry npp_errors [] =
{
error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
#if defined (_MSC_VER)
error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
#endif
error_entry( NPP_BAD_ARG_ERROR ),
error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_TEXTURE_BIND_ERROR ),
error_entry( NPP_COEFF_ERROR ),
error_entry( NPP_RECT_ERROR ),
error_entry( NPP_QUAD_ERROR ),
error_entry( NPP_WRONG_INTERSECTION_ROI_ERROR ),
error_entry( NPP_NOT_EVEN_STEP_ERROR ),
error_entry( NPP_INTERPOLATION_ERROR ),
error_entry( NPP_RESIZE_FACTOR_ERROR ),
error_entry( NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR ),
error_entry( NPP_MEMFREE_ERR ),
error_entry( NPP_MEMSET_ERR ),
error_entry( NPP_MEMCPY_ERROR ),
error_entry( NPP_MEM_ALLOC_ERR ),
error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERR ),
error_entry( NPP_INVALID_INPUT ),
error_entry( NPP_ALIGNMENT_ERROR ),
error_entry( NPP_STEP_ERROR ),
error_entry( NPP_SIZE_ERROR ),
error_entry( NPP_POINTER_ERROR ),
error_entry( NPP_NULL_POINTER_ERROR ),
error_entry( NPP_CUDA_KERNEL_EXECUTION_ERROR ),
error_entry( NPP_NOT_IMPLEMENTED_ERROR ),
error_entry( NPP_ERROR ),
error_entry( NPP_NO_ERROR ),
error_entry( NPP_SUCCESS ),
error_entry( NPP_WARNING ),
error_entry( NPP_WRONG_INTERSECTION_QUAD_WARNING ),
error_entry( NPP_MISALIGNED_DST_ROI_WARNING ),
error_entry( NPP_AFFINE_QUAD_INCORRECT_WARNING ),
error_entry( NPP_DOUBLE_SIZE_WARNING ),
error_entry( NPP_ODD_ROI_WARNING )
};
const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
const ErrorEntry cu_errors [] =
{
error_entry( CUDA_SUCCESS ),
error_entry( CUDA_ERROR_INVALID_VALUE ),
error_entry( CUDA_ERROR_OUT_OF_MEMORY ),
error_entry( CUDA_ERROR_NOT_INITIALIZED ),
error_entry( CUDA_ERROR_DEINITIALIZED ),
error_entry( CUDA_ERROR_PROFILER_DISABLED ),
error_entry( CUDA_ERROR_PROFILER_NOT_INITIALIZED ),
error_entry( CUDA_ERROR_PROFILER_ALREADY_STARTED ),
error_entry( CUDA_ERROR_PROFILER_ALREADY_STOPPED ),
error_entry( CUDA_ERROR_NO_DEVICE ),
error_entry( CUDA_ERROR_INVALID_DEVICE ),
error_entry( CUDA_ERROR_INVALID_IMAGE ),
error_entry( CUDA_ERROR_INVALID_CONTEXT ),
error_entry( CUDA_ERROR_CONTEXT_ALREADY_CURRENT ),
error_entry( CUDA_ERROR_MAP_FAILED ),
error_entry( CUDA_ERROR_UNMAP_FAILED ),
error_entry( CUDA_ERROR_ARRAY_IS_MAPPED ),
error_entry( CUDA_ERROR_ALREADY_MAPPED ),
error_entry( CUDA_ERROR_NO_BINARY_FOR_GPU ),
error_entry( CUDA_ERROR_ALREADY_ACQUIRED ),
error_entry( CUDA_ERROR_NOT_MAPPED ),
error_entry( CUDA_ERROR_NOT_MAPPED_AS_ARRAY ),
error_entry( CUDA_ERROR_NOT_MAPPED_AS_POINTER ),
error_entry( CUDA_ERROR_ECC_UNCORRECTABLE ),
error_entry( CUDA_ERROR_UNSUPPORTED_LIMIT ),
error_entry( CUDA_ERROR_CONTEXT_ALREADY_IN_USE ),
error_entry( CUDA_ERROR_INVALID_SOURCE ),
error_entry( CUDA_ERROR_FILE_NOT_FOUND ),
error_entry( CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ),
error_entry( CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ),
error_entry( CUDA_ERROR_OPERATING_SYSTEM ),
error_entry( CUDA_ERROR_INVALID_HANDLE ),
error_entry( CUDA_ERROR_NOT_FOUND ),
error_entry( CUDA_ERROR_NOT_READY ),
error_entry( CUDA_ERROR_LAUNCH_FAILED ),
error_entry( CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ),
error_entry( CUDA_ERROR_LAUNCH_TIMEOUT ),
error_entry( CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ),
error_entry( CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ),
error_entry( CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ),
error_entry( CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ),
error_entry( CUDA_ERROR_CONTEXT_IS_DESTROYED ),
error_entry( CUDA_ERROR_ASSERT ),
error_entry( CUDA_ERROR_TOO_MANY_PEERS ),
error_entry( CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ),
error_entry( CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ),
error_entry( CUDA_ERROR_UNKNOWN )
};
fprintf(stderr, "OpenCV Error: %s(%s) in %s, file %s, line %d", errorStr, error_string, function, file, line);
fflush(stderr);
const size_t cu_errors_num = sizeof(cu_errors) / sizeof(cu_errors[0]);
cv::String getErrorString(int code, const ErrorEntry* errors, size_t n)
{
size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
const char* msg = (idx != n) ? errors[idx].str : "Unknown error code";
cv::String str = cv::format("%s [Code = %d]", msg, code);
return str;
}
else
cv::error( cv::Exception(code, error_string, func, file, line) );
}
#endif
String cv::gpu::getNppErrorMessage(int code)
{
#ifndef HAVE_CUDA
(void) code;
return String();
#else
return getErrorString(code, npp_errors, npp_error_num);
#endif
}
String cv::gpu::getCudaDriverApiErrorMessage(int code)
{
#ifndef HAVE_CUDA
(void) code;
return String();
#else
return getErrorString(code, cu_errors, cu_errors_num);
#endif
}
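getNppErrorMessage and getCudaDriverApiErrorMessage above resolve a numeric status through a table of { code, name } pairs built with the error_entry macro. A minimal standalone sketch of the same lookup, using hypothetical DEMO_* codes instead of the real NPP/CUDA enums:

#include <algorithm>
#include <cstddef>
#include <cstdio>

struct ErrorEntry { int code; const char* str; };

struct ErrorEntryComparer
{
    int code;
    ErrorEntryComparer(int code_) : code(code_) {}
    bool operator()(const ErrorEntry& e) const { return e.code == code; }
};

#define error_entry(entry) { entry, #entry }

// Hypothetical status codes standing in for the NPP / CUDA driver enums.
enum { DEMO_SUCCESS = 0, DEMO_BAD_ARG = -1, DEMO_OUT_OF_MEMORY = -2 };

static const ErrorEntry demo_errors[] =
{
    error_entry( DEMO_SUCCESS ),
    error_entry( DEMO_BAD_ARG ),
    error_entry( DEMO_OUT_OF_MEMORY )
};

static const std::size_t demo_errors_num = sizeof(demo_errors) / sizeof(demo_errors[0]);

int main()
{
    const int code = DEMO_BAD_ARG;
    const std::size_t idx = std::find_if(demo_errors, demo_errors + demo_errors_num,
                                         ErrorEntryComparer(code)) - demo_errors;
    const char* msg = (idx != demo_errors_num) ? demo_errors[idx].str : "Unknown error code";
    std::printf("%s [Code = %d]\n", msg, code);   // prints: DEMO_BAD_ARG [Code = -1]
    return 0;
}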
bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
{
#ifndef HAVE_CUDA
(void) cpuBorderType;
(void) gpuBorderType;
return false;
#else
switch (cpuBorderType)
{
case IPL_BORDER_REFLECT_101:
gpuBorderType = cv::gpu::BORDER_REFLECT101_GPU;
return true;
case IPL_BORDER_REPLICATE:
gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;
return true;
case IPL_BORDER_CONSTANT:
gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;
return true;
case IPL_BORDER_REFLECT:
gpuBorderType = cv::gpu::BORDER_REFLECT_GPU;
return true;
case IPL_BORDER_WRAP:
gpuBorderType = cv::gpu::BORDER_WRAP_GPU;
return true;
default:
return false;
};
#endif
}
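A short usage sketch of tryConvertToGpuBorderType: it reports unsupported modes through its return value rather than by throwing. The includes assume the declaration lives in opencv2/core/gpumat.hpp in this revision and that OpenCV was built with CUDA; adjust as needed.

#include "opencv2/core/core_c.h"     // IPL_BORDER_* constants
#include "opencv2/core/gpumat.hpp"   // cv::gpu::tryConvertToGpuBorderType (assumed location)

int main()
{
    int gpuBorderType = 0;
    if (cv::gpu::tryConvertToGpuBorderType(IPL_BORDER_REPLICATE, gpuBorderType))
    {
        // gpuBorderType now holds cv::gpu::BORDER_REPLICATE_GPU and can be passed
        // to GPU functions that take the GPU border enum.
    }
    else
    {
        // e.g. IPL_BORDER_TRANSPARENT has no GPU counterpart and ends up here.
    }
    return 0;
}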

@ -41,7 +41,6 @@
//M*/
#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"
using namespace cv;
using namespace cv::gpu;
@ -181,13 +180,12 @@ bool cv::gpu::CudaMem::empty() const
#if !defined (HAVE_CUDA)
void cv::gpu::registerPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
void cv::gpu::unregisterPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/)
{ CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
bool cv::gpu::CudaMem::canMapHostMemory() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return false; }
void cv::gpu::CudaMem::release() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return GpuMat(); }
void cv::gpu::registerPageLocked(Mat&) { throw_no_cuda(); }
void cv::gpu::unregisterPageLocked(Mat&) { throw_no_cuda(); }
void cv::gpu::CudaMem::create(int, int, int, int) { throw_no_cuda(); }
bool cv::gpu::CudaMem::canMapHostMemory() { throw_no_cuda(); return false; }
void cv::gpu::CudaMem::release() { throw_no_cuda(); }
GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_no_cuda(); return GpuMat(); }
#else /* !defined (HAVE_CUDA) */
@ -222,7 +220,7 @@ namespace
void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
{
if (_alloc_type == ALLOC_ZEROCOPY && !canMapHostMemory())
cv::gpu::error("ZeroCopy is not supported by current device", __FILE__, __LINE__);
CV_Error(cv::Error::GpuApiCallError, "ZeroCopy is not supported by current device");
_type &= Mat::TYPE_MASK;
if( rows == _rows && cols == _cols && type() == _type && data )
@ -254,10 +252,10 @@ void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
switch (alloc_type)
{
case ALLOC_PAGE_LOCKED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) ); break;
case ALLOC_ZEROCOPY: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocMapped) ); break;
case ALLOC_WRITE_COMBINED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocWriteCombined) ); break;
default: cv::gpu::error("Invalid alloc type", __FILE__, __LINE__);
case ALLOC_PAGE_LOCKED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) ); break;
case ALLOC_ZEROCOPY: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocMapped) ); break;
case ALLOC_WRITE_COMBINED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocWriteCombined) ); break;
default: CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
}
datastart = data = (uchar*)ptr;
@ -270,15 +268,13 @@ void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
GpuMat cv::gpu::CudaMem::createGpuMatHeader () const
{
CV_Assert( alloc_type == ALLOC_ZEROCOPY );
GpuMat res;
if (alloc_type == ALLOC_ZEROCOPY)
{
void *pdev;
cudaSafeCall( cudaHostGetDevicePointer( &pdev, data, 0 ) );
res = GpuMat(rows, cols, type(), pdev, step);
}
else
cv::gpu::error("Zero-copy is not supported or memory was allocated without zero-copy flag", __FILE__, __LINE__);
void *pdev;
cudaSafeCall( cudaHostGetDevicePointer( &pdev, data, 0 ) );
res = GpuMat(rows, cols, type(), pdev, step);
return res;
}
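For context, a hedged usage sketch of the zero-copy path exercised by createGpuMatHeader above, assuming OpenCV was built with CUDA and the device supports mapped host memory:

#include "opencv2/core/gpumat.hpp"   // CudaMem, GpuMat (assumed to pull in the core Mat types)

int main()
{
    using namespace cv;
    using namespace cv::gpu;

    if (!CudaMem::canMapHostMemory())
        return 0;                                          // zero-copy unsupported on this device

    // Page-locked host buffer that the GPU can address directly.
    CudaMem pageLocked(480, 640, CV_8UC1, CudaMem::ALLOC_ZEROCOPY);

    Mat hostView = pageLocked;                             // host-side header, no copy
    GpuMat deviceView = pageLocked.createGpuMatHeader();   // device-side header, no copy

    hostView.setTo(Scalar::all(0));                        // the write is visible to the GPU as well
    return 0;
}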

@ -41,16 +41,12 @@
//M*/
#include "precomp.hpp"
#include "opencv2/core/opengl.hpp"
#include "opencv2/core/gpumat.hpp"
#ifdef HAVE_OPENGL
#include "gl_core_3_1.hpp"
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#endif
# include "gl_core_3_1.hpp"
# ifdef HAVE_CUDA
# include <cuda_gl_interop.h>
# endif
#endif
using namespace cv;
@ -59,15 +55,9 @@ using namespace cv::gpu;
namespace
{
#ifndef HAVE_OPENGL
void throw_nogl() { CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support"); }
void throw_no_ogl() { CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support"); }
#else
void throw_nogl() { CV_Error(CV_OpenGlApiCallError, "OpenGL context doesn't exist"); }
#ifndef HAVE_CUDA
void throw_nocuda() { CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support"); }
#else
void throw_nocuda() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); }
#endif
void throw_no_ogl() { CV_Error(CV_OpenGlApiCallError, "OpenGL context doesn't exist"); }
#endif
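The rest of this file repeats one guard pattern: the public signature always compiles, and the body either forwards to the real implementation or throws through throw_no_ogl / throw_no_cuda. A self-contained illustration with hypothetical DEMO_* flags in place of HAVE_OPENGL / HAVE_CUDA:

#include <stdexcept>

// Toggle these to mimic the build configuration.
//#define DEMO_HAVE_OPENGL
//#define DEMO_HAVE_CUDA

static void throw_no_ogl()  { throw std::runtime_error("the library is compiled without OpenGL support"); }
static void throw_no_cuda() { throw std::runtime_error("the library is compiled without CUDA support"); }

// Hypothetical API call; the signature is compiled unconditionally.
void render(int textureId)
{
#ifndef DEMO_HAVE_OPENGL
    (void) textureId;           // silence unused-parameter warnings, as in the patch
    throw_no_ogl();
#else
#  ifndef DEMO_HAVE_CUDA
    (void) textureId;
    throw_no_cuda();            // OpenGL is available, but the CUDA interop path is not
#  else
    // ... real OpenGL/CUDA interop work would go here ...
#  endif
#endif
}

int main()
{
    try { render(0); }
    catch (const std::exception&) { /* expected in a build without OpenGL */ }
    return 0;
}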
bool checkError(const char* file, const int line, const char* func = 0)
@ -137,11 +127,11 @@ void cv::gpu::setGlDevice(int device)
{
#ifndef HAVE_OPENGL
(void) device;
throw_nogl();
throw_no_ogl();
#else
#if !defined(HAVE_CUDA) || defined(CUDA_DISABLER)
(void) device;
throw_nocuda();
throw_no_cuda();
#else
cudaSafeCall( cudaGLSetGLDevice(device) );
#endif
@ -476,7 +466,7 @@ void cv::ogl::Buffer::Impl::unmapHost()
cv::ogl::Buffer::Buffer() : rows_(0), cols_(0), type_(0)
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
#else
impl_ = Impl::empty();
#endif
@ -490,7 +480,7 @@ cv::ogl::Buffer::Buffer(int arows, int acols, int atype, unsigned int abufId, bo
(void) atype;
(void) abufId;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
impl_ = new Impl(abufId, autoRelease);
rows_ = arows;
@ -506,7 +496,7 @@ cv::ogl::Buffer::Buffer(Size asize, int atype, unsigned int abufId, bool autoRel
(void) atype;
(void) abufId;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
impl_ = new Impl(abufId, autoRelease);
rows_ = asize.height;
@ -531,7 +521,7 @@ cv::ogl::Buffer::Buffer(InputArray arr, Target target, bool autoRelease) : rows_
(void) arr;
(void) target;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
const int kind = arr.kind();
@ -578,7 +568,7 @@ void cv::ogl::Buffer::create(int arows, int acols, int atype, Target target, boo
(void) atype;
(void) target;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
if (rows_ != arows || cols_ != acols || type_ != atype)
{
@ -607,7 +597,7 @@ void cv::ogl::Buffer::setAutoRelease(bool flag)
{
#ifndef HAVE_OPENGL
(void) flag;
throw_nogl();
throw_no_ogl();
#else
impl_->setAutoRelease(flag);
#endif
@ -619,7 +609,7 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)
(void) arr;
(void) target;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
const int kind = arr.kind();
@ -647,7 +637,7 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)
case _InputArray::GPU_MAT:
{
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
throw_nocuda();
throw_no_cuda();
#else
GpuMat dmat = arr.getGpuMat();
impl_->copyFrom(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows);
@ -672,7 +662,7 @@ void cv::ogl::Buffer::copyTo(OutputArray arr, Target target, bool autoRelease) c
(void) arr;
(void) target;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
const int kind = arr.kind();
@ -693,7 +683,7 @@ void cv::ogl::Buffer::copyTo(OutputArray arr, Target target, bool autoRelease) c
case _InputArray::GPU_MAT:
{
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
throw_nocuda();
throw_no_cuda();
#else
GpuMat& dmat = arr.getGpuMatRef();
dmat.create(rows_, cols_, type_);
@ -719,7 +709,7 @@ cv::ogl::Buffer cv::ogl::Buffer::clone(Target target, bool autoRelease) const
#ifndef HAVE_OPENGL
(void) target;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
return cv::ogl::Buffer();
#else
ogl::Buffer buf;
@ -732,7 +722,7 @@ void cv::ogl::Buffer::bind(Target target) const
{
#ifndef HAVE_OPENGL
(void) target;
throw_nogl();
throw_no_ogl();
#else
impl_->bind(target);
#endif
@ -742,7 +732,7 @@ void cv::ogl::Buffer::unbind(Target target)
{
#ifndef HAVE_OPENGL
(void) target;
throw_nogl();
throw_no_ogl();
#else
gl::BindBuffer(target, 0);
CV_CheckGlError();
@ -753,7 +743,7 @@ Mat cv::ogl::Buffer::mapHost(Access access)
{
#ifndef HAVE_OPENGL
(void) access;
throw_nogl();
throw_no_ogl();
return Mat();
#else
return Mat(rows_, cols_, type_, impl_->mapHost(access));
@ -763,7 +753,7 @@ Mat cv::ogl::Buffer::mapHost(Access access)
void cv::ogl::Buffer::unmapHost()
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
#else
return impl_->unmapHost();
#endif
@ -772,11 +762,11 @@ void cv::ogl::Buffer::unmapHost()
GpuMat cv::ogl::Buffer::mapDevice()
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
return GpuMat();
#else
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
throw_nocuda();
throw_no_cuda();
return GpuMat();
#else
return GpuMat(rows_, cols_, type_, impl_->mapDevice());
@ -787,10 +777,10 @@ GpuMat cv::ogl::Buffer::mapDevice()
void cv::ogl::Buffer::unmapDevice()
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
#else
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
throw_nocuda();
throw_no_cuda();
#else
impl_->unmapDevice();
#endif
@ -800,7 +790,7 @@ void cv::ogl::Buffer::unmapDevice()
unsigned int cv::ogl::Buffer::bufId() const
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
return 0;
#else
return impl_->bufId();
@ -926,7 +916,7 @@ void cv::ogl::Texture2D::Impl::bind() const
cv::ogl::Texture2D::Texture2D() : rows_(0), cols_(0), format_(NONE)
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
#else
impl_ = Impl::empty();
#endif
@ -940,7 +930,7 @@ cv::ogl::Texture2D::Texture2D(int arows, int acols, Format aformat, unsigned int
(void) aformat;
(void) atexId;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
impl_ = new Impl(atexId, autoRelease);
rows_ = arows;
@ -956,7 +946,7 @@ cv::ogl::Texture2D::Texture2D(Size asize, Format aformat, unsigned int atexId, b
(void) aformat;
(void) atexId;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
impl_ = new Impl(atexId, autoRelease);
rows_ = asize.height;
@ -980,7 +970,7 @@ cv::ogl::Texture2D::Texture2D(InputArray arr, bool autoRelease) : rows_(0), cols
#ifndef HAVE_OPENGL
(void) arr;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
const int kind = arr.kind();
@ -1016,7 +1006,7 @@ cv::ogl::Texture2D::Texture2D(InputArray arr, bool autoRelease) : rows_(0), cols
case _InputArray::GPU_MAT:
{
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
throw_nocuda();
throw_no_cuda();
#else
GpuMat dmat = arr.getGpuMat();
ogl::Buffer buf(dmat, ogl::Buffer::PIXEL_UNPACK_BUFFER);
@ -1051,7 +1041,7 @@ void cv::ogl::Texture2D::create(int arows, int acols, Format aformat, bool autoR
(void) acols;
(void) aformat;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
if (rows_ != arows || cols_ != acols || format_ != aformat)
{
@ -1080,7 +1070,7 @@ void cv::ogl::Texture2D::setAutoRelease(bool flag)
{
#ifndef HAVE_OPENGL
(void) flag;
throw_nogl();
throw_no_ogl();
#else
impl_->setAutoRelease(flag);
#endif
@ -1091,7 +1081,7 @@ void cv::ogl::Texture2D::copyFrom(InputArray arr, bool autoRelease)
#ifndef HAVE_OPENGL
(void) arr;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
const int kind = arr.kind();
@ -1129,7 +1119,7 @@ void cv::ogl::Texture2D::copyFrom(InputArray arr, bool autoRelease)
case _InputArray::GPU_MAT:
{
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
throw_nocuda();
throw_no_cuda();
#else
GpuMat dmat = arr.getGpuMat();
ogl::Buffer buf(dmat, ogl::Buffer::PIXEL_UNPACK_BUFFER);
@ -1158,7 +1148,7 @@ void cv::ogl::Texture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) c
(void) arr;
(void) ddepth;
(void) autoRelease;
throw_nogl();
throw_no_ogl();
#else
const int kind = arr.kind();
@ -1180,7 +1170,7 @@ void cv::ogl::Texture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) c
case _InputArray::GPU_MAT:
{
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
throw_nocuda();
throw_no_cuda();
#else
ogl::Buffer buf(rows_, cols_, CV_MAKE_TYPE(ddepth, cn), ogl::Buffer::PIXEL_PACK_BUFFER);
buf.bind(ogl::Buffer::PIXEL_PACK_BUFFER);
@ -1207,7 +1197,7 @@ void cv::ogl::Texture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) c
void cv::ogl::Texture2D::bind() const
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
#else
impl_->bind();
#endif
@ -1216,7 +1206,7 @@ void cv::ogl::Texture2D::bind() const
unsigned int cv::ogl::Texture2D::texId() const
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
return 0;
#else
return impl_->texId();
@ -1331,7 +1321,7 @@ void cv::ogl::Arrays::setAutoRelease(bool flag)
void cv::ogl::Arrays::bind() const
{
#ifndef HAVE_OPENGL
throw_nogl();
throw_no_ogl();
#else
CV_Assert( texCoord_.empty() || texCoord_.size().area() == size_ );
CV_Assert( normal_.empty() || normal_.size().area() == size_ );
@ -1416,7 +1406,7 @@ void cv::ogl::render(const ogl::Texture2D& tex, Rect_<double> wndRect, Rect_<dou
(void) tex;
(void) wndRect;
(void) texRect;
throw_nogl();
throw_no_ogl();
#else
if (!tex.empty())
{
@ -1488,7 +1478,7 @@ void cv::ogl::render(const ogl::Arrays& arr, int mode, Scalar color)
(void) arr;
(void) mode;
(void) color;
throw_nogl();
throw_no_ogl();
#else
if (!arr.empty())
{
@ -1508,7 +1498,7 @@ void cv::ogl::render(const ogl::Arrays& arr, InputArray indices, int mode, Scala
(void) indices;
(void) mode;
(void) color;
throw_nogl();
throw_no_ogl();
#else
if (!arr.empty() && !indices.empty())
{

@ -46,8 +46,10 @@
#include "opencv2/core/utility.hpp"
#include "opencv2/core/core_c.h"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/core/opengl.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/gpu_private.hpp"
#include <assert.h>
#include <ctype.h>
@ -64,37 +66,6 @@
#define GET_OPTIMIZED(func) (func)
#endif
#ifdef HAVE_CUDA
# include <cuda_runtime.h>
# include <npp.h>
# define CUDART_MINIMUM_REQUIRED_VERSION 4020
# define NPP_MINIMUM_REQUIRED_VERSION 4200
# if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
# error "Insufficient Cuda Runtime library version, please update it."
# endif
# if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
# error "Insufficient NPP version, please update it."
# endif
# if defined(__GNUC__)
# define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
# else
# define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
# endif
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
#else
# define cudaSafeCall(expr)
#endif //HAVE_CUDA
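The block removed here is the usual safe-call wrapper (the equivalent now comes from opencv2/core/gpu_private.hpp, which this file includes instead): a macro captures the call site and a helper converts a non-success status into an error. A standalone sketch of the idea with a hypothetical status type in place of cudaError_t:

#include <cstdio>
#include <sstream>
#include <stdexcept>

typedef int demo_status_t;                     // stands in for cudaError_t
static const demo_status_t DEMO_SUCCESS = 0;

static void demo_error(const char* msg, const char* file, int line, const char* func)
{
    std::ostringstream os;
    os << msg << " (" << func << ") at " << file << ":" << line;
    throw std::runtime_error(os.str());
}

static inline void ___demoSafeCall(demo_status_t err, const char* file, int line, const char* func = "")
{
    if (err != DEMO_SUCCESS)
        demo_error("demo API call failed", file, line, func);
}

#if defined(__GNUC__)
#  define demoSafeCall(expr) ___demoSafeCall(expr, __FILE__, __LINE__, __func__)
#else
#  define demoSafeCall(expr) ___demoSafeCall(expr, __FILE__, __LINE__)
#endif

int main()
{
    try { demoSafeCall(-1); }                  // any non-success status throws with call-site info
    catch (const std::exception& e) { std::printf("%s\n", e.what()); }
    return 0;
}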
namespace cv
{

@ -8,8 +8,6 @@ ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video o
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
file(GLOB lib_hdrs "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.hpp" "include/opencv2/${name}/device/*.h")
file(GLOB lib_device_hdrs_detail "include/opencv2/${name}/device/detail/*.hpp" "include/opencv2/${name}/device/detail/*.h")
file(GLOB lib_int_hdrs "src/*.hpp" "src/*.h")
file(GLOB lib_cuda_hdrs "src/cuda/*.hpp" "src/cuda/*.h")
file(GLOB lib_srcs "src/*.cpp")
@ -18,8 +16,6 @@ file(GLOB lib_cuda "src/cuda/*.cu*")
source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
if(HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
@ -64,7 +60,7 @@ else()
endif()
ocv_set_module_sources(
HEADERS ${lib_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail}
HEADERS ${lib_hdrs}
SOURCES ${lib_int_hdrs} ${lib_cuda_hdrs} ${lib_srcs} ${lib_cuda} ${ncv_files} ${cuda_objs}
)

@ -1,67 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __OPENCV_GPU_HOST_DEVICE__
#endif
namespace cv { namespace gpu
{
namespace device
{
template<bool expr> struct Static {};
template<> struct Static<true>
{
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
};
}
}}
#undef __OPENCV_GPU_HOST_DEVICE__
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
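The deleted static_check.hpp provided a pre-C++11 compile-time assertion: Static<expr>::check() exists only when expr is true, so a false condition fails to compile. A self-contained sketch of how such a check is typically used (the demo namespace below is illustrative, not the removed header):

namespace demo
{
    template <bool expr> struct Static {};     // general case: no check()
    template <> struct Static<true>
    {
        static void check() {}                 // only the 'true' specialization compiles
    };
}

template <typename T>
void packIntoInt(T value)
{
    // Compile-time guard: refuses types wider than int, because Static<false> has no check().
    demo::Static<(sizeof(T) <= sizeof(int))>::check();
    (void) value;
}

int main()
{
    packIntoInt<short>(1);              // fine
    // packIntoInt<long long>(1);       // would fail to compile on typical platforms
    return 0;
}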

@ -54,10 +54,6 @@
#include <cstdio>
#include <iostream>
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#endif
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
@ -70,7 +66,7 @@
#include "opencv2/legacy.hpp"
#include "opencv2/photo.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/gpu_private.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined

@ -47,19 +47,19 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_nogpu(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
@ -444,7 +444,7 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
////////////////////////////////////////////////////////////////////////
// Polar <-> Cart
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace mathfunc
{
@ -457,7 +457,7 @@ namespace
{
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::device::mathfunc;
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F);
@ -477,7 +477,7 @@ namespace
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::device::mathfunc;
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F);

@ -44,14 +44,14 @@
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::GMG_GPU::GMG_GPU() { throw_nogpu(); }
void cv::gpu::GMG_GPU::initialize(cv::Size, float, float) { throw_nogpu(); }
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, cv::gpu::Stream&) { throw_nogpu(); }
cv::gpu::GMG_GPU::GMG_GPU() { throw_no_cuda(); }
void cv::gpu::GMG_GPU::initialize(cv::Size, float, float) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, cv::gpu::Stream&) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::release() {}
#else
namespace cv { namespace gpu { namespace device {
namespace cv { namespace gpu { namespace cudev {
namespace bgfg_gmg
{
void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
@ -77,7 +77,7 @@ cv::gpu::GMG_GPU::GMG_GPU()
void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
{
using namespace cv::gpu::device::bgfg_gmg;
using namespace cv::gpu::cudev::bgfg_gmg;
CV_Assert(min < max);
CV_Assert(maxFeatures > 0);
@ -107,7 +107,7 @@ void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float newLearningRate, cv::gpu::Stream& stream)
{
using namespace cv::gpu::device::bgfg_gmg;
using namespace cv::gpu::cudev::bgfg_gmg;
typedef void (*func_t)(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
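The recurring change in these GPU module files is the rename of the device-code namespace from cv::gpu::device to cv::gpu::cudev; host wrappers declare the CUDA launchers inside that namespace and pull them in with a using-directive, as in "using namespace cv::gpu::cudev::bgfg_gmg;" above. A minimal sketch of that shape (demo and demo_op are hypothetical, and the launcher gets a stub body so the sketch compiles without CUDA):

#include <cstdio>

namespace cv { namespace gpu { namespace cudev
{
    namespace demo
    {
        // Normally only declared here and implemented in a .cu file; stubbed for illustration.
        inline void demo_op(int value) { std::printf("demo_op(%d)\n", value); }
    }
}}}

void hostWrapper(int value)
{
    using namespace cv::gpu::cudev::demo;   // same pattern as the host wrappers above
    demo_op(value);
}

int main()
{
    hostWrapper(42);
    return 0;
}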

@ -44,21 +44,21 @@
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::MOG_GPU::MOG_GPU(int) { throw_nogpu(); }
void cv::gpu::MOG_GPU::initialize(cv::Size, int) { throw_nogpu(); }
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, Stream&) { throw_nogpu(); }
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_nogpu(); }
cv::gpu::MOG_GPU::MOG_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG_GPU::release() {}
cv::gpu::MOG2_GPU::MOG2_GPU(int) { throw_nogpu(); }
void cv::gpu::MOG2_GPU::initialize(cv::Size, int) { throw_nogpu(); }
void cv::gpu::MOG2_GPU::operator()(const GpuMat&, GpuMat&, float, Stream&) { throw_nogpu(); }
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_nogpu(); }
cv::gpu::MOG2_GPU::MOG2_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::operator()(const GpuMat&, GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::release() {}
#else
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace mog
{
@ -123,7 +123,7 @@ void cv::gpu::MOG_GPU::initialize(cv::Size frameSize, int frameType)
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float learningRate, Stream& stream)
{
using namespace cv::gpu::device::mog;
using namespace cv::gpu::cudev::mog;
CV_Assert(frame.depth() == CV_8U);
@ -146,7 +146,7 @@ void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat&
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
using namespace cv::gpu::device::mog;
using namespace cv::gpu::cudev::mog;
backgroundImage.create(frameSize_, frameType_);
@ -208,7 +208,7 @@ cv::gpu::MOG2_GPU::MOG2_GPU(int nmixtures) :
void cv::gpu::MOG2_GPU::initialize(cv::Size frameSize, int frameType)
{
using namespace cv::gpu::device::mog;
using namespace cv::gpu::cudev::mog;
CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
@ -236,7 +236,7 @@ void cv::gpu::MOG2_GPU::initialize(cv::Size frameSize, int frameType)
void cv::gpu::MOG2_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate, Stream& stream)
{
using namespace cv::gpu::device::mog;
using namespace cv::gpu::cudev::mog;
int ch = frame.channels();
int work_ch = ch;
@ -256,7 +256,7 @@ void cv::gpu::MOG2_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float le
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
using namespace cv::gpu::device::mog;
using namespace cv::gpu::cudev::mog;
backgroundImage.create(frameSize_, frameType_);

@ -47,14 +47,14 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_nogpu(); }
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_nogpu(); }
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_no_cuda(); }
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_no_cuda(); }
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace disp_bilateral_filter
{
@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
}
}}}
using namespace ::cv::gpu::device::disp_bilateral_filter;
using namespace ::cv::gpu::cudev::disp_bilateral_filter;
namespace
{

@ -47,11 +47,11 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
@ -62,7 +62,7 @@ namespace cv { namespace gpu { namespace device
}
}}}
using namespace ::cv::gpu::device::blend;
using namespace ::cv::gpu::cudev::blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)

@ -47,41 +47,41 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::BFMatcher_GPU::BFMatcher_GPU(int) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::add(const std::vector<GpuMat>&) { throw_nogpu(); }
const std::vector<GpuMat>& cv::gpu::BFMatcher_GPU::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; }
void cv::gpu::BFMatcher_GPU::clear() { throw_nogpu(); }
bool cv::gpu::BFMatcher_GPU::empty() const { throw_nogpu(); return true; }
bool cv::gpu::BFMatcher_GPU::isMaskSupported() const { throw_nogpu(); return true; }
void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::matchConvert(const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::match(const GpuMat&, const GpuMat&, std::vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::makeGpuCollection(GpuMat&, GpuMat&, const std::vector<GpuMat>&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::matchConvert(const Mat&, const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const std::vector<GpuMat>&, Stream&) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
cv::gpu::BFMatcher_GPU::BFMatcher_GPU(int) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::add(const std::vector<GpuMat>&) { throw_no_cuda(); }
const std::vector<GpuMat>& cv::gpu::BFMatcher_GPU::getTrainDescriptors() const { throw_no_cuda(); return trainDescCollection; }
void cv::gpu::BFMatcher_GPU::clear() { throw_no_cuda(); }
bool cv::gpu::BFMatcher_GPU::empty() const { throw_no_cuda(); return true; }
bool cv::gpu::BFMatcher_GPU::isMaskSupported() const { throw_no_cuda(); return true; }
void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::matchConvert(const Mat&, const Mat&, std::vector<DMatch>&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::match(const GpuMat&, const GpuMat&, std::vector<DMatch>&, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::makeGpuCollection(GpuMat&, GpuMat&, const std::vector<GpuMat>&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::matchConvert(const Mat&, const Mat&, const Mat&, std::vector<DMatch>&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const std::vector<GpuMat>&, Stream&) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace bf_match
{
@ -197,7 +197,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai
if (query.empty() || train.empty())
return;
using namespace cv::gpu::device::bf_match;
using namespace cv::gpu::cudev::bf_match;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
@ -340,7 +340,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat&
if (query.empty() || trainCollection.empty())
return;
using namespace cv::gpu::device::bf_match;
using namespace cv::gpu::cudev::bf_match;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@ -451,7 +451,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
if (query.empty() || train.empty())
return;
using namespace cv::gpu::device::bf_knnmatch;
using namespace cv::gpu::cudev::bf_knnmatch;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
@ -580,7 +580,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
if (query.empty() || trainCollection.empty())
return;
using namespace cv::gpu::device::bf_knnmatch;
using namespace cv::gpu::cudev::bf_knnmatch;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@ -761,7 +761,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
if (query.empty() || train.empty())
return;
using namespace cv::gpu::device::bf_radius_match;
using namespace cv::gpu::cudev::bf_radius_match;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -890,7 +890,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
if (query.empty() || empty())
return;
using namespace cv::gpu::device::bf_radius_match;
using namespace cv::gpu::cudev::bf_radius_match;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,

@ -47,15 +47,15 @@ using namespace cv::gpu;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, std::vector<int>*) { throw_nogpu(); }
void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, std::vector<int>*) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace transform_points
{
@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace device
}
}}}
using namespace ::cv::gpu::device;
using namespace ::cv::gpu::cudev;
namespace
{

@ -49,15 +49,15 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); }
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String&) { throw_nogpu(); }
cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { throw_nogpu(); }
bool cv::gpu::CascadeClassifier_GPU::empty() const { throw_nogpu(); return true; }
bool cv::gpu::CascadeClassifier_GPU::load(const String&) { throw_nogpu(); return true; }
Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_nogpu(); return Size();}
void cv::gpu::CascadeClassifier_GPU::release() { throw_nogpu(); }
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size) {throw_nogpu(); return -1;}
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_nogpu(); return -1;}
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_no_cuda(); }
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String&) { throw_no_cuda(); }
cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { throw_no_cuda(); }
bool cv::gpu::CascadeClassifier_GPU::empty() const { throw_no_cuda(); return true; }
bool cv::gpu::CascadeClassifier_GPU::load(const String&) { throw_no_cuda(); return true; }
Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_no_cuda(); return Size();}
void cv::gpu::CascadeClassifier_GPU::release() { throw_no_cuda(); }
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size) {throw_no_cuda(); return -1;}
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}
#else
@ -340,7 +340,7 @@ struct PyrLavel
cv::Size sWindow;
};
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace lbp
{
@ -441,7 +441,7 @@ public:
acc += level.sFrame.width + 1;
}
device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
cudev::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
}
@ -449,7 +449,7 @@ public:
return 0;
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
cudev::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaDeviceSynchronize() );

@ -47,17 +47,17 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); }
void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_no_cuda(); }
void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
#include "cvt_color_internal.h"
namespace cv { namespace gpu {
namespace device
namespace cudev
{
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
@ -69,7 +69,7 @@ namespace cv { namespace gpu {
}
}}
using namespace ::cv::gpu::device;
using namespace ::cv::gpu::cudev;
namespace
{
@ -77,7 +77,7 @@ namespace
void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -90,7 +90,7 @@ namespace
void bgr_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, bgr_to_bgra_16u, 0, 0, bgr_to_bgra_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -103,7 +103,7 @@ namespace
void bgr_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, bgr_to_rgba_16u, 0, 0, bgr_to_rgba_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -116,7 +116,7 @@ namespace
void bgra_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, bgra_to_bgr_16u, 0, 0, bgra_to_bgr_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -129,7 +129,7 @@ namespace
void bgra_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, bgra_to_rgb_16u, 0, 0, bgra_to_rgb_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -142,7 +142,7 @@ namespace
void bgra_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, bgra_to_rgba_16u, 0, 0, bgra_to_rgba_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -160,7 +160,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::bgr_to_bgr555(src, dst, StreamAccessor::getStream(stream));
cudev::bgr_to_bgr555(src, dst, StreamAccessor::getStream(stream));
}
void bgr_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -170,7 +170,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::bgr_to_bgr565(src, dst, StreamAccessor::getStream(stream));
cudev::bgr_to_bgr565(src, dst, StreamAccessor::getStream(stream));
}
void rgb_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -180,7 +180,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::rgb_to_bgr555(src, dst, StreamAccessor::getStream(stream));
cudev::rgb_to_bgr555(src, dst, StreamAccessor::getStream(stream));
}
void rgb_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -190,7 +190,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::rgb_to_bgr565(src, dst, StreamAccessor::getStream(stream));
cudev::rgb_to_bgr565(src, dst, StreamAccessor::getStream(stream));
}
void bgra_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -200,7 +200,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::bgra_to_bgr555(src, dst, StreamAccessor::getStream(stream));
cudev::bgra_to_bgr555(src, dst, StreamAccessor::getStream(stream));
}
void bgra_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -210,7 +210,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::bgra_to_bgr565(src, dst, StreamAccessor::getStream(stream));
cudev::bgra_to_bgr565(src, dst, StreamAccessor::getStream(stream));
}
void rgba_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -220,7 +220,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::rgba_to_bgr555(src, dst, StreamAccessor::getStream(stream));
cudev::rgba_to_bgr555(src, dst, StreamAccessor::getStream(stream));
}
void rgba_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -230,7 +230,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::rgba_to_bgr565(src, dst, StreamAccessor::getStream(stream));
cudev::rgba_to_bgr565(src, dst, StreamAccessor::getStream(stream));
}
void bgr555_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -240,7 +240,7 @@ namespace
dst.create(src.size(), CV_8UC3);
device::bgr555_to_rgb(src, dst, StreamAccessor::getStream(stream));
cudev::bgr555_to_rgb(src, dst, StreamAccessor::getStream(stream));
}
void bgr565_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -250,7 +250,7 @@ namespace
dst.create(src.size(), CV_8UC3);
device::bgr565_to_rgb(src, dst, StreamAccessor::getStream(stream));
cudev::bgr565_to_rgb(src, dst, StreamAccessor::getStream(stream));
}
void bgr555_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -260,7 +260,7 @@ namespace
dst.create(src.size(), CV_8UC3);
device::bgr555_to_bgr(src, dst, StreamAccessor::getStream(stream));
cudev::bgr555_to_bgr(src, dst, StreamAccessor::getStream(stream));
}
void bgr565_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -270,7 +270,7 @@ namespace
dst.create(src.size(), CV_8UC3);
device::bgr565_to_bgr(src, dst, StreamAccessor::getStream(stream));
cudev::bgr565_to_bgr(src, dst, StreamAccessor::getStream(stream));
}
void bgr555_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -280,7 +280,7 @@ namespace
dst.create(src.size(), CV_8UC4);
device::bgr555_to_rgba(src, dst, StreamAccessor::getStream(stream));
cudev::bgr555_to_rgba(src, dst, StreamAccessor::getStream(stream));
}
void bgr565_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -290,7 +290,7 @@ namespace
dst.create(src.size(), CV_8UC4);
device::bgr565_to_rgba(src, dst, StreamAccessor::getStream(stream));
cudev::bgr565_to_rgba(src, dst, StreamAccessor::getStream(stream));
}
void bgr555_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -300,7 +300,7 @@ namespace
dst.create(src.size(), CV_8UC4);
device::bgr555_to_bgra(src, dst, StreamAccessor::getStream(stream));
cudev::bgr555_to_bgra(src, dst, StreamAccessor::getStream(stream));
}
void bgr565_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -310,12 +310,12 @@ namespace
dst.create(src.size(), CV_8UC4);
device::bgr565_to_bgra(src, dst, StreamAccessor::getStream(stream));
cudev::bgr565_to_bgra(src, dst, StreamAccessor::getStream(stream));
}
void gray_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, gray_to_bgr_16u, 0, 0, gray_to_bgr_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -328,7 +328,7 @@ namespace
void gray_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, gray_to_bgra_16u, 0, 0, gray_to_bgra_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -346,7 +346,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::gray_to_bgr555(src, dst, StreamAccessor::getStream(stream));
cudev::gray_to_bgr555(src, dst, StreamAccessor::getStream(stream));
}
void gray_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -356,7 +356,7 @@ namespace
dst.create(src.size(), CV_8UC2);
device::gray_to_bgr565(src, dst, StreamAccessor::getStream(stream));
cudev::gray_to_bgr565(src, dst, StreamAccessor::getStream(stream));
}
void bgr555_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -366,7 +366,7 @@ namespace
dst.create(src.size(), CV_8UC1);
device::bgr555_to_gray(src, dst, StreamAccessor::getStream(stream));
cudev::bgr555_to_gray(src, dst, StreamAccessor::getStream(stream));
}
void bgr565_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
@ -376,12 +376,12 @@ namespace
dst.create(src.size(), CV_8UC1);
device::bgr565_to_gray(src, dst, StreamAccessor::getStream(stream));
cudev::bgr565_to_gray(src, dst, StreamAccessor::getStream(stream));
}
void rgb_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, rgb_to_gray_16u, 0, 0, rgb_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -394,7 +394,7 @@ namespace
void bgr_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, bgr_to_gray_16u, 0, 0, bgr_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -407,7 +407,7 @@ namespace
void rgba_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, rgba_to_gray_16u, 0, 0, rgba_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -420,7 +420,7 @@ namespace
void bgra_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, bgra_to_gray_16u, 0, 0, bgra_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
@ -433,7 +433,7 @@ namespace
void rgb_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -459,7 +459,7 @@ namespace
void bgr_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -485,7 +485,7 @@ namespace
void yuv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -511,7 +511,7 @@ namespace
void yuv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -537,7 +537,7 @@ namespace
void rgb_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -563,7 +563,7 @@ namespace
void bgr_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -589,7 +589,7 @@ namespace
void YCrCb_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -615,7 +615,7 @@ namespace
void YCrCb_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -641,7 +641,7 @@ namespace
void rgb_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -667,7 +667,7 @@ namespace
void bgr_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -693,7 +693,7 @@ namespace
void xyz_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -719,7 +719,7 @@ namespace
void xyz_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -745,7 +745,7 @@ namespace
void rgb_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -771,7 +771,7 @@ namespace
void bgr_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -797,7 +797,7 @@ namespace
void hsv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -823,7 +823,7 @@ namespace
void hsv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -849,7 +849,7 @@ namespace
void rgb_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -875,7 +875,7 @@ namespace
void bgr_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -901,7 +901,7 @@ namespace
void hls_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -927,7 +927,7 @@ namespace
void hls_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -953,7 +953,7 @@ namespace
void rgb_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -979,7 +979,7 @@ namespace
void bgr_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -1005,7 +1005,7 @@ namespace
void hsv_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -1031,7 +1031,7 @@ namespace
void hsv_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -1057,7 +1057,7 @@ namespace
void rgb_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -1083,7 +1083,7 @@ namespace
void bgr_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -1109,7 +1109,7 @@ namespace
void hls_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -1135,7 +1135,7 @@ namespace
void hls_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][6] =
{
{
@ -1161,7 +1161,7 @@ namespace
void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1187,7 +1187,7 @@ namespace
void rgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1213,7 +1213,7 @@ namespace
void lbgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1239,7 +1239,7 @@ namespace
void lrgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1265,7 +1265,7 @@ namespace
void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1291,7 +1291,7 @@ namespace
void lab_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1317,7 +1317,7 @@ namespace
void lab_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1343,7 +1343,7 @@ namespace
void lab_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1369,7 +1369,7 @@ namespace
void bgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1395,7 +1395,7 @@ namespace
void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1421,7 +1421,7 @@ namespace
void lbgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1447,7 +1447,7 @@ namespace
void lrgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1473,7 +1473,7 @@ namespace
void luv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1499,7 +1499,7 @@ namespace
void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1525,7 +1525,7 @@ namespace
void luv_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1551,7 +1551,7 @@ namespace
void luv_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
static const gpu_func_t funcs[2][2][2] =
{
{
@ -1895,9 +1895,9 @@ void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Str
code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
if (dcn == 3)
device::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
cudev::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
else
device::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
cudev::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
break;
}
@ -1917,7 +1917,7 @@ void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Str
const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
device::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
cudev::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
break;
}
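Each color wrapper in the hunks above selects a CUDA kernel from a small per-depth function table and asserts that the source depth is CV_8U, CV_16U, or CV_32F. The following is a minimal sketch of that dispatch using the names visible in the diff; the typedef, the dst.create call, and the indexing line are assumptions added for illustration, since the hunks only show the table and the assertion.

    // Hedged sketch of the per-depth dispatch pattern used by the color wrappers
    // above. The function table and the CV_Assert are taken from the diff; the
    // gpu_func_t signature, output allocation, and indexing are assumptions.
    void bgr_to_rgb_sketch(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::cudev;

        typedef void (*gpu_func_t)(const PtrStepSzb& src, PtrStepSzb dst, cudaStream_t stream); // assumed signature
        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f};

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);

        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));              // assumption
        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));  // assumption: depth value indexes the table
    }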

@ -1,137 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "cu_safe_call.h"
#ifdef HAVE_CUDA
namespace
{
#define error_entry(entry) { entry, #entry }
struct ErrorEntry
{
int code;
const char* str;
};
class ErrorEntryComparer
{
public:
inline ErrorEntryComparer(int code) : code_(code) {}
inline bool operator()(const ErrorEntry& e) const { return e.code == code_; }
private:
int code_;
};
cv::String getErrorString(int code, const ErrorEntry* errors, size_t n)
{
size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
const char* msg = (idx != n) ? errors[idx].str : "Unknown error code";
cv::String str = cv::format("%s [Code = %d]", msg, code);
return str;
}
const ErrorEntry cu_errors [] =
{
error_entry( CUDA_SUCCESS ),
error_entry( CUDA_ERROR_INVALID_VALUE ),
error_entry( CUDA_ERROR_OUT_OF_MEMORY ),
error_entry( CUDA_ERROR_NOT_INITIALIZED ),
error_entry( CUDA_ERROR_DEINITIALIZED ),
error_entry( CUDA_ERROR_PROFILER_DISABLED ),
error_entry( CUDA_ERROR_PROFILER_NOT_INITIALIZED ),
error_entry( CUDA_ERROR_PROFILER_ALREADY_STARTED ),
error_entry( CUDA_ERROR_PROFILER_ALREADY_STOPPED ),
error_entry( CUDA_ERROR_NO_DEVICE ),
error_entry( CUDA_ERROR_INVALID_DEVICE ),
error_entry( CUDA_ERROR_INVALID_IMAGE ),
error_entry( CUDA_ERROR_INVALID_CONTEXT ),
error_entry( CUDA_ERROR_CONTEXT_ALREADY_CURRENT ),
error_entry( CUDA_ERROR_MAP_FAILED ),
error_entry( CUDA_ERROR_UNMAP_FAILED ),
error_entry( CUDA_ERROR_ARRAY_IS_MAPPED ),
error_entry( CUDA_ERROR_ALREADY_MAPPED ),
error_entry( CUDA_ERROR_NO_BINARY_FOR_GPU ),
error_entry( CUDA_ERROR_ALREADY_ACQUIRED ),
error_entry( CUDA_ERROR_NOT_MAPPED ),
error_entry( CUDA_ERROR_NOT_MAPPED_AS_ARRAY ),
error_entry( CUDA_ERROR_NOT_MAPPED_AS_POINTER ),
error_entry( CUDA_ERROR_ECC_UNCORRECTABLE ),
error_entry( CUDA_ERROR_UNSUPPORTED_LIMIT ),
error_entry( CUDA_ERROR_CONTEXT_ALREADY_IN_USE ),
error_entry( CUDA_ERROR_INVALID_SOURCE ),
error_entry( CUDA_ERROR_FILE_NOT_FOUND ),
error_entry( CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ),
error_entry( CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ),
error_entry( CUDA_ERROR_OPERATING_SYSTEM ),
error_entry( CUDA_ERROR_INVALID_HANDLE ),
error_entry( CUDA_ERROR_NOT_FOUND ),
error_entry( CUDA_ERROR_NOT_READY ),
error_entry( CUDA_ERROR_LAUNCH_FAILED ),
error_entry( CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ),
error_entry( CUDA_ERROR_LAUNCH_TIMEOUT ),
error_entry( CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ),
error_entry( CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ),
error_entry( CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ),
error_entry( CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ),
error_entry( CUDA_ERROR_CONTEXT_IS_DESTROYED ),
error_entry( CUDA_ERROR_ASSERT ),
error_entry( CUDA_ERROR_TOO_MANY_PEERS ),
error_entry( CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ),
error_entry( CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ),
error_entry( CUDA_ERROR_UNKNOWN )
};
const size_t cu_errors_num = sizeof(cu_errors) / sizeof(cu_errors[0]);
}
cv::String cv::gpu::detail::cuGetErrString(CUresult res)
{
return getErrorString(res, cu_errors, cu_errors_num);
}
#endif // HAVE_CUDA

@ -1,67 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __CU_SAFE_CALL_H__
#define __CU_SAFE_CALL_H__
#include "precomp.hpp"
#ifdef HAVE_CUDA
namespace cv { namespace gpu {
namespace detail
{
String cuGetErrString(CUresult res);
inline void cuSafeCall_impl(CUresult res, const char* file, int line)
{
if (res != CUDA_SUCCESS)
cv::error( cv::Exception(CV_GpuApiCallError, cuGetErrString(res), "unknown function", file, line) );
}
}
}}
#define cuSafeCall( op ) cv::gpu::detail::cuSafeCall_impl( (op), __FILE__, __LINE__ )
#endif // HAVE_CUDA
#endif // __CU_SAFE_CALL_H__
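The header above defines cuSafeCall so that any CUDA driver API call returning a non-success CUresult raises a cv::Exception carrying the message produced by cuGetErrString. A minimal, hypothetical call site (illustration only, not code from this patch) would look like:

    // Hypothetical call site for the cuSafeCall macro from cu_safe_call.h above.
    // cuInit and cuDeviceGetCount are standard CUDA driver API functions; this
    // helper is an illustration only, not code from this repository.
    #include "cu_safe_call.h"

    #ifdef HAVE_CUDA
    static int availableCudaDevices()
    {
        cuSafeCall( cuInit(0) );                 // throws cv::Exception on failure

        int count = 0;
        cuSafeCall( cuDeviceGetCount(&count) );
        return count;
    }
    #endif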

@ -49,9 +49,9 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace device {
namespace cv { namespace gpu { namespace cudev {
namespace video_decoding
{
__constant__ uint constAlpha = ((uint)0xff << 24);

@ -42,15 +42,15 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace bf_knnmatch
{
@ -1249,7 +1249,7 @@ namespace cv { namespace gpu { namespace device
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
} // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* CUDA_DISABLER */

@ -42,14 +42,14 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace bf_match
{
@ -768,7 +768,7 @@ namespace cv { namespace gpu { namespace device
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace device {
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* CUDA_DISABLER */

@ -42,13 +42,13 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace bf_radius_match
{
@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace device
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,11 +42,11 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace device {
namespace cv { namespace gpu { namespace cudev {
namespace bgfg_gmg
{
__constant__ int c_width;

@ -42,12 +42,12 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace mog
{

@ -42,10 +42,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
@ -55,7 +55,7 @@ typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -163,7 +163,7 @@ namespace cv { namespace gpu { namespace device
#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
template void cv::gpu::device::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)

@ -42,9 +42,9 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
@ -115,7 +115,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,12 +42,12 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
@ -79,7 +79,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cv::gpu::device::transform(src, dst, TransformOp(), WithOutMask(), stream);
cv::gpu::cudev::transform(src, dst, TransformOp(), WithOutMask(), stream);
}
} // namespace transform_points
@ -120,7 +120,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
cv::gpu::device::transform(src, dst, ProjectOp(), WithOutMask(), stream);
cv::gpu::cudev::transform(src, dst, ProjectOp(), WithOutMask(), stream);
}
} // namespace project_points
@ -187,7 +187,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace solvepnp_ransac
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -44,14 +44,14 @@
#include <utility>
#include <algorithm>//std::swap
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace canny
{
@ -77,7 +77,7 @@ namespace canny
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
@ -475,7 +475,7 @@ namespace canny
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
{

@ -42,15 +42,15 @@
#if !defined CUDA_DISABLER
#include <opencv2/gpu/device/common.hpp>
#include <opencv2/gpu/device/vec_traits.hpp>
#include <opencv2/gpu/device/vec_math.hpp>
#include <opencv2/gpu/device/emulation.hpp>
#include <opencv2/core/cuda/common.hpp>
#include <opencv2/core/cuda/vec_traits.hpp>
#include <opencv2/core/cuda/vec_math.hpp>
#include <opencv2/core/cuda/emulation.hpp>
#include <iostream>
#include <stdio.h>
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace ccl
{

@ -42,15 +42,15 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/scan.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/scan.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace clahe
{

@ -42,12 +42,12 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/color.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "cvt_color_internal.h"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
@ -229,7 +229,7 @@ namespace cv { namespace gpu { namespace device
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
@ -456,6 +456,6 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -40,13 +40,13 @@
//
//M*/
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace column_filter
{

@ -42,10 +42,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -126,6 +126,6 @@ namespace cv { namespace gpu { namespace device
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,14 +42,14 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/color.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct Bayer2BGR;

@ -42,10 +42,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace disp_bilateral_filter
{
@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device
}
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "disp_bilateral_filter");
CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
if (stream == 0)
@ -218,6 +218,6 @@ namespace cv { namespace gpu { namespace device
template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
} // namespace bilateral_filter
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,16 +42,16 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/simd_functions.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace arithm
{
@ -193,7 +193,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -216,21 +216,21 @@ namespace arithm
{
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
else
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
}
template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -308,7 +308,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
@ -323,9 +323,9 @@ namespace arithm
AddScalar<T, S, D> op(static_cast<S>(val));
if (mask.data)
transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -428,7 +428,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -451,21 +451,21 @@ namespace arithm
{
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
else
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
}
template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -536,9 +536,9 @@ namespace arithm
AddScalar<T, S, D> op(-static_cast<S>(val));
if (mask.data)
transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -657,7 +657,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<arithm::Mul_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -676,12 +676,12 @@ namespace arithm
{
void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
}
void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
@ -690,12 +690,12 @@ namespace arithm
if (scale == 1)
{
Mul<T, D> op;
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
MulScale<T, S, D> op(static_cast<S>(scale));
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
@ -774,7 +774,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
@ -787,7 +787,7 @@ namespace arithm
void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
MulScalar<T, S, D> op(static_cast<S>(val));
transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -925,7 +925,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<arithm::Div_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -944,12 +944,12 @@ namespace arithm
{
void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
}
void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
@ -958,12 +958,12 @@ namespace arithm
if (scale == 1)
{
Div<T, D> op;
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
DivScale<T, S, D> op(static_cast<S>(scale));
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
@ -1033,7 +1033,7 @@ namespace arithm
void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
MulScalar<T, S, D> op(static_cast<S>(1.0 / val));
transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1111,7 +1111,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
@ -1124,7 +1124,7 @@ namespace arithm
void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
DivInv<T, S, D> op(static_cast<S>(val));
transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1240,7 +1240,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -1263,18 +1263,18 @@ namespace arithm
{
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
}
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
}
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@ -1305,7 +1305,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1319,7 +1319,7 @@ namespace arithm
{
AbsDiffScalar<T, S> op(static_cast<S>(val));
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
}
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
@ -1334,7 +1334,7 @@ namespace arithm
//////////////////////////////////////////////////////////////////////////
// absMat
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< abs_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1346,7 +1346,7 @@ namespace arithm
template <typename T>
void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
}
template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -1375,7 +1375,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< arithm::Sqr<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1387,7 +1387,7 @@ namespace arithm
template <typename T>
void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
}
template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -1402,7 +1402,7 @@ namespace arithm
//////////////////////////////////////////////////////////////////////////
// sqrtMat
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< sqrt_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1414,7 +1414,7 @@ namespace arithm
template <typename T>
void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
}
template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -1429,7 +1429,7 @@ namespace arithm
//////////////////////////////////////////////////////////////////////////
// logMat
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< log_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1441,7 +1441,7 @@ namespace arithm
template <typename T>
void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
}
template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -1471,7 +1471,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< arithm::Exp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1483,7 +1483,7 @@ namespace arithm
template <typename T>
void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
}
template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -1554,7 +1554,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -1580,26 +1580,26 @@ namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
Cmp<Op<T>, T> op;
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
}
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@ -1716,7 +1716,7 @@ namespace arithm
#undef TYPE_VEC
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <class Op, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Op, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
@ -1735,7 +1735,7 @@ namespace arithm
src_t val1 = VecTraits<src_t>::make(sval);
CmpScalar<Op<T>, T, cn> op(val1);
transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
}
template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
@ -1875,7 +1875,7 @@ namespace arithm
//////////////////////////////////////////////////////////////////////////////////////
// bitMat
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1899,33 +1899,33 @@ namespace arithm
template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
else
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
else
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
else
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
else
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
}
template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -1948,7 +1948,7 @@ namespace arithm
//////////////////////////////////////////////////////////////////////////////////////
// bitScalar
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -1967,17 +1967,17 @@ namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
@ -2026,7 +2026,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -2053,17 +2053,17 @@ namespace arithm
{
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@ -2076,7 +2076,7 @@ namespace arithm
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
}
template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
@ -2118,7 +2118,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
@ -2145,17 +2145,17 @@ namespace arithm
{
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
}
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
cudev::transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@ -2168,7 +2168,7 @@ namespace arithm
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
}
template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
@ -2183,7 +2183,7 @@ namespace arithm
//////////////////////////////////////////////////////////////////////////
// threshold
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -2212,7 +2212,7 @@ namespace arithm
void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream)
{
Op<T> op(thresh, maxVal);
transform(src, dst, op, WithOutMask(), stream);
cudev::transform(src, dst, op, WithOutMask(), stream);
}
template <typename T>
@ -2297,7 +2297,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< arithm::PowOp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
@ -2309,7 +2309,7 @@ namespace arithm
template<typename T>
void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
}
template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
@ -2372,7 +2372,7 @@ namespace arithm
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <typename T1, typename T2, typename D, size_t src1_size, size_t src2_size, size_t dst_size> struct AddWeightedTraits : DefaultTransformFunctorTraits< arithm::AddWeighted<T1, T2, D> >
{
@ -2393,7 +2393,7 @@ namespace arithm
{
AddWeighted<T1, T2, D> op(alpha, beta, gamma);
transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
cudev::transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
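
Note on the arithmetic hunks above: every change only re-qualifies the same call, handing an element-wise functor (sqrt_func, log_func, Exp, Cmp, bit_and, minimum, maximum, PowOp, AddWeighted, ...) to the transform helper together with WithOutMask() and the target stream. The standalone sketch below shows what such a transform-style wrapper boils down to; the kernel, the pitched-pointer arithmetic, and the 32x8 block shape are illustrative assumptions, not the OpenCV implementation.

#include <cuda_runtime.h>

// Illustrative sketch only: apply `op` to every pixel of a pitched 2D image on `stream`.
// A masked variant would add a per-pixel test before writing the result.
template <typename T, typename D, typename Op>
__global__ void transform_kernel(const T* src, size_t src_step, D* dst, size_t dst_step,
                                 int rows, int cols, Op op)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
    {
        const T* src_row = (const T*)((const char*)src + y * src_step);   // pitched row start
        D*       dst_row = (D*)((char*)dst + y * dst_step);
        dst_row[x] = op(src_row[x]);                                      // element-wise functor
    }
}

template <typename T, typename D, typename Op>
void transform_sketch(const T* src, size_t src_step, D* dst, size_t dst_step,
                      int rows, int cols, Op op, cudaStream_t stream)
{
    const dim3 block(32, 8);                                              // assumed launch shape
    const dim3 grid((cols + block.x - 1) / block.x, (rows + block.y - 1) / block.y);
    transform_kernel<<<grid, block, 0, stream>>>(src, src_step, dst, dst_step, rows, cols, op);
}
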

@ -42,10 +42,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace fast
{

@ -42,16 +42,16 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "fgd_bgfg_common.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace bgfg
{

@ -45,21 +45,19 @@
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ uint g_counter = 0;
__device__ int g_counter = 0;
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, uint max_count, int rows, int cols)
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
{
#if __CUDA_ARCH__ >= 110
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
@ -84,15 +82,13 @@ namespace cv { namespace gpu { namespace device
if (val == maxVal)
{
const uint ind = atomicInc(&g_counter, (uint)(-1));
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < max_count)
corners[ind] = make_float2(j, i);
}
}
}
#endif // __CUDA_ARCH__ >= 110
}
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
@ -100,7 +96,7 @@ namespace cv { namespace gpu { namespace device
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(uint)) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
bindTexture(&eigTex, eig);
@ -116,10 +112,10 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
return min(count, max_count);
return std::min(count, max_count);
}
class EigGreater
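
Note on the gfft hunk above: the unsigned counter driven by atomicInc is replaced by a plain int driven by ::atomicAdd, and the host-side clamp switches to std::min; the device-counter pattern itself is unchanged. A minimal standalone version of that pattern is sketched below (record_kernel, the 1-D data layout, and the launch configuration are illustrative assumptions, not the OpenCV kernel).

#include <cuda_runtime.h>
#include <algorithm>

__device__ int g_counter = 0;   // device-side output counter, zeroed before each launch

__global__ void record_kernel(const float* data, int n, float threshold,
                              float* out, int max_count)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && data[i] > threshold)
    {
        // atomicAdd returns the previous value, giving each accepted element a unique slot
        const int ind = ::atomicAdd(&g_counter, 1);
        if (ind < max_count)
            out[ind] = data[i];
    }
}

int record(const float* d_data, int n, float threshold, float* d_out, int max_count)
{
    void* counter_ptr = 0;
    cudaGetSymbolAddress(&counter_ptr, g_counter);
    cudaMemset(counter_ptr, 0, sizeof(int));                     // reset before the launch

    const int block = 256;
    record_kernel<<<(n + block - 1) / block, block>>>(d_data, n, threshold, d_out, max_count);
    cudaDeviceSynchronize();

    int count = 0;
    cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost);
    return std::min(count, max_count);                           // clamp: extra hits were dropped
}
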

@ -45,9 +45,9 @@
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace device { namespace globmotion {
namespace cv { namespace gpu { namespace cudev { namespace globmotion {
__constant__ float cml[9];
__constant__ float cmr[9];

@ -42,13 +42,13 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace hist
{
@ -127,7 +127,7 @@ namespace hist
};
}
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
{
@ -146,7 +146,7 @@ namespace hist
const float scale = 255.0f / (src.cols * src.rows);
transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}

@ -42,12 +42,12 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
// Other values are not supported
#define CELL_WIDTH 8
@ -316,7 +316,7 @@ namespace cv { namespace gpu { namespace device
else if (nthreads == 512)
normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else
cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__, "normalize_hists");
CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");
cudaSafeCall( cudaGetLastError() );
@ -808,7 +808,7 @@ namespace cv { namespace gpu { namespace device
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
} // namespace hog
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -45,13 +45,13 @@
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/dynamic_smem.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/dynamic_smem.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace hough
{
@ -1424,7 +1424,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
thrust::device_ptr<int> sizesPtr(sizes);
thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, cudev::bind2nd(cudev::minimum<int>(), maxSize));
}
void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,

@ -42,14 +42,14 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -1002,7 +1002,7 @@ namespace cv { namespace gpu { namespace device
template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device {
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* CUDA_DISABLER */

@ -42,9 +42,9 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{

@ -48,37 +48,10 @@
#include "NPP_staging.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu
{
enum
{
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU,
BORDER_REFLECT_GPU,
BORDER_WRAP_GPU
};
class NppStreamHandler
{
public:
inline explicit NppStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppGetStream();
nppSetStream(newStream);
}
inline ~NppStreamHandler()
{
nppSetStream(oldStream);
}
private:
cudaStream_t oldStream;
};
class NppStStreamHandler
{
public:

@ -43,10 +43,10 @@
#if !defined CUDA_DISABLER
#include "lbp.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace lbp
{

@ -43,10 +43,10 @@
#ifndef __OPENCV_GPU_DEVICE_LBP_HPP_
#define __OPENCV_GPU_DEVICE_LBP_HPP_
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
namespace cv { namespace gpu { namespace device {
namespace cv { namespace gpu { namespace cudev {
namespace lbp {

@ -42,10 +42,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace match_template
{
@ -910,7 +910,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
} //namespace match_template
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,9 +42,9 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace mathfunc
{
@ -212,6 +212,6 @@ namespace cv { namespace gpu { namespace device
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,18 +42,18 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/type_traits.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/type_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace detail
{
@ -205,7 +205,7 @@ namespace detail
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
}
template <typename R>
@ -225,7 +225,7 @@ namespace detail
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
template <typename R>
@ -245,7 +245,7 @@ namespace detail
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
template <typename R>
@ -340,7 +340,7 @@ namespace sum
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
cudev::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
if (tid == 0)
{
@ -383,7 +383,7 @@ namespace sum
}
}
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
cudev::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
}
@ -642,7 +642,7 @@ namespace minMax
const minimum<R> minOp;
const maximum<R> maxOp;
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
cudev::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
@ -690,7 +690,7 @@ namespace minMax
}
}
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
cudev::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
}
@ -994,7 +994,7 @@ namespace countNonZero
}
}
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
cudev::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#if __CUDA_ARCH__ >= 200
if (tid == 0)
@ -1019,7 +1019,7 @@ namespace countNonZero
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
cudev::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
if (tid == 0)
{
@ -1217,7 +1217,7 @@ namespace reduce
volatile S* srow = smem + threadIdx.y * 16;
myVal = srow[threadIdx.x];
device::reduce<16>(srow, myVal, threadIdx.x, op);
cudev::reduce<16>(srow, myVal, threadIdx.x, op);
if (threadIdx.x == 0)
srow[0] = myVal;
@ -1301,7 +1301,7 @@ namespace reduce
for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
cudev::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
if (threadIdx.x == 0)
dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
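
Note on the reduction hunks above: they only re-qualify cudev::reduce<BLOCK_SIZE>; the operation underneath is a shared-memory block reduction. A minimal standalone sum reduction, written as a plain kernel rather than the tuple-based OpenCV helper, is sketched below (block_sum_kernel and its launch shape are illustrative assumptions).

#include <cuda_runtime.h>

// Illustrative sketch only: one partial sum per block; the partials are reduced again
// on a second pass or on the host. Assumes BLOCK_SIZE is a power of two.
template <int BLOCK_SIZE>
__global__ void block_sum_kernel(const float* src, int n, float* block_sums)
{
    __shared__ float smem[BLOCK_SIZE];

    const int tid = threadIdx.x;
    const int i   = blockIdx.x * BLOCK_SIZE + tid;

    smem[tid] = (i < n) ? src[i] : 0.0f;    // each thread loads one element (or 0 past the end)
    __syncthreads();

    // tree reduction in shared memory: halve the number of active threads each step
    for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            smem[tid] += smem[tid + s];
        __syncthreads();
    }

    if (tid == 0)
        block_sums[blockIdx.x] = smem[0];
}
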

@ -42,12 +42,12 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
@ -57,7 +57,7 @@ typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denosing
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -179,7 +179,7 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denosing (fast approximate version)
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace device
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
@ -209,7 +209,7 @@ namespace cv { namespace gpu { namespace device
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
@ -228,7 +228,7 @@ namespace cv { namespace gpu { namespace device
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
@ -247,7 +247,7 @@ namespace cv { namespace gpu { namespace device
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)

@ -42,13 +42,13 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace optflowbm
{

@ -42,9 +42,9 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace optical_flow
{

@ -42,8 +42,8 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#define tx threadIdx.x
#define ty threadIdx.y
@ -55,7 +55,7 @@
#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100
namespace cv { namespace gpu { namespace device { namespace optflow_farneback
namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
{
__constant__ float c_g[8];
__constant__ float c_xg[8];
@ -641,7 +641,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
callers[borderMode](src, ksizeHalf, dst, stream);
}
}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}}}} // namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
#endif /* CUDA_DISABLER */

@ -45,11 +45,11 @@
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace orb
{

@ -42,13 +42,13 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace device
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,13 +42,13 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -191,6 +191,6 @@ namespace cv { namespace gpu { namespace device
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,15 +42,15 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
using namespace cv::gpu::cudev;
namespace pyrlk
{

@ -42,14 +42,14 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -43,15 +43,15 @@
#if !defined CUDA_DISABLER
#include <cfloat>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
#include "opencv2/gpu/device/scan.hpp"
namespace cv { namespace gpu { namespace device
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/scan.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
@ -296,7 +296,7 @@ namespace cv { namespace gpu { namespace device
};
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

@ -42,10 +42,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace cudev
{
namespace video_encoding
{

Some files were not shown because too many files have changed in this diff.