Merge pull request #1299 from jet47:gpu-cuda-rename

12 years ago · 298a1d50d2
parent c0c575d68e 6bbac2a7d9
commit 298a1d50d2
601 changed files with 8926 additions and 9764 deletions
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -86,7 +86,7 @@ endmacro()
 # Usage:
 #   ocv_add_module(<name> [INTERNAL|BINDINGS] [REQUIRED] [<list of dependencies>] [OPTIONAL <list of optional dependencies>])
 # Example:
-#   ocv_add_module(yaom INTERNAL opencv_core opencv_highgui opencv_flann OPTIONAL opencv_gpu)
+#   ocv_add_module(yaom INTERNAL opencv_core opencv_highgui opencv_flann OPTIONAL opencv_cuda)
 macro(ocv_add_module _name)
  string(TOLOWER "${_name}" name)
  string(REGEX REPLACE "^opencv_" "" ${name} "${name}")
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@ -17,7 +17,7 @@ if(BUILD_DOCS AND HAVE_SPHINX)
  set(OPTIONAL_DOC_LIST "")


-  set(OPENCV2_BASE_MODULES core imgproc highgui video calib3d features2d objdetect ml flann gpu photo stitching nonfree contrib legacy bioinspired)
+  set(OPENCV2_BASE_MODULES core imgproc highgui video calib3d features2d objdetect ml flann photo stitching nonfree contrib legacy bioinspired)

  # build lists of modules to be documented
  set(OPENCV2_MODULES "")
--- a/doc/check_docs2.py
+++ b/doc/check_docs2.py
@ -33,7 +33,7 @@ doc_signatures_whitelist = [
 # templates
 "Matx", "Vec", "SparseMat_", "Scalar_", "Mat_", "Ptr", "Size_", "Point_", "Rect_", "Point3_",
 "DataType", "detail::RotationWarperBase", "flann::Index_", "CalonderDescriptorExtractor",
-"gpu::PtrStepSz", "gpu::PtrStep", "gpu::PtrElemStep_",
+"cuda::PtrStepSz", "cuda::PtrStep", "cuda::PtrElemStep_",
 # black boxes
 "CvArr", "CvFileStorage",
 # other
@ -200,10 +200,10 @@ def process_module(module, path):
        for filename in fnmatch.filter(files, "*.h*"):
            hdrlist.append(os.path.join(root, filename))

-    if module == "gpu":
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu_types.hpp"))
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu.hpp"))
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu_stream_accessor.hpp"))
+    if module == "cuda":
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_types.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_stream_accessor.hpp"))

    decls = []
    for hname in hdrlist:
@ -212,7 +212,7 @@ def process_module(module, path):

    funcs = []
    # not really needed to hardcode all the namespaces. Normally all they are collected automatically
-    namespaces = ['cv', 'cv.gpu', 'cvflann', 'cvflann.anyimpl', 'cvflann.lsh', 'cv.flann', 'cv.linemod', 'cv.detail', 'cvtest', 'perf', 'cv.videostab']
+    namespaces = ['cv', 'cv.cuda', 'cvflann', 'cvflann.anyimpl', 'cvflann.lsh', 'cv.flann', 'cv.linemod', 'cv.detail', 'cvtest', 'perf', 'cv.videostab']
    classes = []
    structs = []

--- a/modules/core/doc/opengl_interop.rst
+++ b/modules/core/doc/opengl_interop.rst
@ -83,7 +83,7 @@ The constructors.

    :param abufId: Buffer object name.

-    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`gpu::GpuMat` or ``std::vector`` ).
+    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`cuda::GpuMat` or ``std::vector`` ).

    :param target: Buffer usage. See :ocv:enum:`ogl::Buffer::Target` .

@ -148,7 +148,7 @@ Copies from host/device memory to OpenGL buffer.

 .. ocv:function:: void ogl::Buffer::copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false)

-    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`gpu::GpuMat` or ``std::vector`` ).
+    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`cuda::GpuMat` or ``std::vector`` ).

    :param target: Buffer usage. See :ocv:enum:`ogl::Buffer::Target` .

@ -162,7 +162,7 @@ Copies from OpenGL buffer to host/device memory or another OpenGL buffer object.

 .. ocv:function:: void ogl::Buffer::copyTo(OutputArray arr) const

-    :param arr: Destination array (host or device memory, can be :ocv:class:`Mat` , :ocv:class:`gpu::GpuMat` , ``std::vector`` or ``ogl::Buffer`` ).
+    :param arr: Destination array (host or device memory, can be :ocv:class:`Mat` , :ocv:class:`cuda::GpuMat` , ``std::vector`` or ``ogl::Buffer`` ).



@ -229,7 +229,7 @@ ogl::Buffer::mapDevice
 ----------------------
 Maps OpenGL buffer to CUDA device memory.

-.. ocv:function:: gpu::GpuMat ogl::Buffer::mapDevice()
+.. ocv:function:: cuda::GpuMat ogl::Buffer::mapDevice()

 This operatation doesn't copy data.
 Several buffer objects can be mapped to CUDA memory at a time.
@ -291,7 +291,7 @@ The constructors.

    :param aformat: Image format. See :ocv:enum:`ogl::Texture2D::Format` .

-    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`gpu::GpuMat` or :ocv:class:`ogl::Buffer` ).
+    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`cuda::GpuMat` or :ocv:class:`ogl::Buffer` ).

    :param autoRelease: Auto release mode (if true, release will be called in object's destructor).

@ -351,7 +351,7 @@ Copies from host/device memory to OpenGL texture.

 .. ocv:function:: void ogl::Texture2D::copyFrom(InputArray arr, bool autoRelease = false)

-    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`gpu::GpuMat` or :ocv:class:`ogl::Buffer` ).
+    :param arr: Input array (host or device memory, it can be :ocv:class:`Mat` , :ocv:class:`cuda::GpuMat` or :ocv:class:`ogl::Buffer` ).

    :param autoRelease: Auto release mode (if true, release will be called in object's destructor).

@ -363,7 +363,7 @@ Copies from OpenGL texture to host/device memory or another OpenGL texture objec

 .. ocv:function:: void ogl::Texture2D::copyTo(OutputArray arr, int ddepth = CV_32F, bool autoRelease = false) const

-    :param arr: Destination array (host or device memory, can be :ocv:class:`Mat` , :ocv:class:`gpu::GpuMat` , :ocv:class:`ogl::Buffer` or ``ogl::Texture2D`` ).
+    :param arr: Destination array (host or device memory, can be :ocv:class:`Mat` , :ocv:class:`cuda::GpuMat` , :ocv:class:`ogl::Buffer` or ``ogl::Texture2D`` ).

    :param ddepth: Destination depth.

@ -532,12 +532,12 @@ Render OpenGL texture or primitives.



-gpu::setGlDevice
----------------
+cuda::setGlDevice
+-----------------
 Sets a CUDA device and initializes it for the current thread with OpenGL interoperability.

-.. ocv:function:: void gpu::setGlDevice( int device = 0 )
+.. ocv:function:: void cuda::setGlDevice( int device = 0 )

-    :param device: System index of a GPU device starting with 0.
+    :param device: System index of a CUDA device starting with 0.

 This function should be explicitly called after OpenGL context creation and before any CUDA calls.
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@ -490,7 +490,7 @@ namespace ogl
    class CV_EXPORTS Arrays;
 }

-namespace gpu
+namespace cuda
 {
    class CV_EXPORTS GpuMat;
    class CV_EXPORTS CudaMem;
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@ -41,17 +41,17 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_GPU_HPP__
-#define __OPENCV_CORE_GPU_HPP__
+#ifndef __OPENCV_CORE_CUDA_HPP__
+#define __OPENCV_CORE_CUDA_HPP__

 #ifndef __cplusplus
-#  error gpu.hpp header must be compiled as C++
+#  error cuda.hpp header must be compiled as C++
 #endif

 #include "opencv2/core.hpp"
-#include "opencv2/core/gpu_types.hpp"
+#include "opencv2/core/cuda_types.hpp"

-namespace cv { namespace gpu {
+namespace cv { namespace cuda {

 //////////////////////////////// GpuMat ///////////////////////////////

@ -453,7 +453,7 @@ enum FeatureSet
 //! checks whether current device supports the given feature
 CV_EXPORTS bool deviceSupports(FeatureSet feature_set);

-//! information about what GPU archs this OpenCV GPU module was compiled for
+//! information about what GPU archs this OpenCV CUDA module was compiled for
 class CV_EXPORTS TargetArchs
 {
 public:
@ -654,7 +654,7 @@ public:
    //! checks whether device supports the given feature
    bool supports(FeatureSet feature_set) const;

-    //! checks whether the GPU module can be run on the given device
+    //! checks whether the CUDA module can be run on the given device
    bool isCompatible() const;

 private:
@ -664,9 +664,9 @@ private:
 CV_EXPORTS void printCudaDeviceInfo(int device);
 CV_EXPORTS void printShortCudaDeviceInfo(int device);

-}} // namespace cv { namespace gpu {
+}} // namespace cv { namespace cuda {


-#include "opencv2/core/gpu.inl.hpp"
+#include "opencv2/core/cuda.inl.hpp"

-#endif /* __OPENCV_CORE_GPU_HPP__ */
+#endif /* __OPENCV_CORE_CUDA_HPP__ */
--- a/modules/core/include/opencv2/core/cuda.inl.hpp
+++ b/modules/core/include/opencv2/core/cuda.inl.hpp
@ -41,12 +41,12 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_GPUINL_HPP__
-#define __OPENCV_CORE_GPUINL_HPP__
+#ifndef __OPENCV_CORE_CUDAINL_HPP__
+#define __OPENCV_CORE_CUDAINL_HPP__

-#include "opencv2/core/gpu.hpp"
+#include "opencv2/core/cuda.hpp"

-namespace cv { namespace gpu {
+namespace cv { namespace cuda {

 //////////////////////////////// GpuMat ///////////////////////////////

@ -587,14 +587,14 @@ bool DeviceInfo::supports(FeatureSet feature_set) const
    return version >= feature_set;
 }

-}} // namespace cv { namespace gpu {
+}} // namespace cv { namespace cuda {

 //////////////////////////////// Mat ////////////////////////////////

 namespace cv {

 inline
-Mat::Mat(const gpu::GpuMat& m)
+Mat::Mat(const cuda::GpuMat& m)
    : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows)
 {
    m.download(*this);
@ -602,4 +602,4 @@ Mat::Mat(const gpu::GpuMat& m)

 }

-#endif // __OPENCV_CORE_GPUINL_HPP__
+#endif // __OPENCV_CORE_CUDAINL_HPP__
--- a/modules/core/include/opencv2/core/cuda/block.hpp
+++ b/modules/core/include/opencv2/core/cuda/block.hpp
@ -40,10 +40,10 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_DEVICE_BLOCK_HPP__
-#define __OPENCV_GPU_DEVICE_BLOCK_HPP__
+#ifndef __OPENCV_CUDA_DEVICE_BLOCK_HPP__
+#define __OPENCV_CUDA_DEVICE_BLOCK_HPP__

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    struct Block
    {
@ -200,4 +200,4 @@ namespace cv { namespace gpu { namespace cudev
    };
 }}}

-#endif /* __OPENCV_GPU_DEVICE_BLOCK_HPP__ */
+#endif /* __OPENCV_CUDA_DEVICE_BLOCK_HPP__ */
--- a/modules/core/include/opencv2/core/cuda/border_interpolate.hpp
+++ b/modules/core/include/opencv2/core/cuda/border_interpolate.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
-#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
+#ifndef __OPENCV_CUDA_BORDER_INTERPOLATE_HPP__
+#define __OPENCV_CUDA_BORDER_INTERPOLATE_HPP__

 #include "saturate_cast.hpp"
 #include "vec_traits.hpp"
 #include "vec_math.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    //////////////////////////////////////////////////////////////
    // BrdConstant
@ -709,6 +709,6 @@ namespace cv { namespace gpu { namespace cudev
        int width;
        D val;
    };
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
+#endif // __OPENCV_CUDA_BORDER_INTERPOLATE_HPP__
--- a/modules/core/include/opencv2/core/cuda/color.hpp
+++ b/modules/core/include/opencv2/core/cuda/color.hpp
@ -40,262 +40,262 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_COLOR_HPP__
-#define __OPENCV_GPU_COLOR_HPP__
+#ifndef __OPENCV_CUDA_COLOR_HPP__
+#define __OPENCV_CUDA_COLOR_HPP__

 #include "detail/color_detail.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
-    // All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
+    // All OPENCV_CUDA_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implement
    // template <typename T> class ColorSpace1_to_ColorSpace2_traits
    // {
    //     typedef ... functor_type;
    //     static __host__ __device__ functor_type create_functor();
    // };

-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
-
-    #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
-
-    #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
-    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)
-
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)
-
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
-    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)
-
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
-    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS
-
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)
-
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
-    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
-
-    #undef OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
+
+    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS
+}}} // namespace cv { namespace cuda { namespace cudev
+
+#endif // __OPENCV_CUDA_BORDER_INTERPOLATE_HPP__
--- a/modules/core/include/opencv2/core/cuda/common.hpp
+++ b/modules/core/include/opencv2/core/cuda/common.hpp
@ -40,11 +40,11 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_COMMON_HPP__
-#define __OPENCV_GPU_COMMON_HPP__
+#ifndef __OPENCV_CUDA_COMMON_HPP__
+#define __OPENCV_CUDA_COMMON_HPP__

 #include <cuda_runtime.h>
-#include "opencv2/core/gpu_types.hpp"
+#include "opencv2/core/cuda_types.hpp"
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/base.hpp"

@ -56,7 +56,7 @@
    #endif
 #endif

-namespace cv { namespace gpu {
+namespace cv { namespace cuda {
    static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func)
    {
        if (cudaSuccess != err)
@ -66,13 +66,13 @@ namespace cv { namespace gpu {

 #ifndef cudaSafeCall
    #if defined(__GNUC__)
-        #define cudaSafeCall(expr)  cv::gpu::checkCudaError(expr, __FILE__, __LINE__, __func__)
+        #define cudaSafeCall(expr)  cv::cuda::checkCudaError(expr, __FILE__, __LINE__, __func__)
    #else /* defined(__CUDACC__) || defined(__MSVC__) */
-        #define cudaSafeCall(expr)  cv::gpu::checkCudaError(expr, __FILE__, __LINE__, "")
+        #define cudaSafeCall(expr)  cv::cuda::checkCudaError(expr, __FILE__, __LINE__, "")
    #endif
 #endif

-namespace cv { namespace gpu
+namespace cv { namespace cuda
 {
    template <typename T> static inline bool isAligned(const T* ptr, size_t size)
    {
@ -85,9 +85,9 @@ namespace cv { namespace gpu
    }
 }}

-namespace cv { namespace gpu
+namespace cv { namespace cuda
 {
-    namespace cudev
+    namespace device
    {
        __host__ __device__ __forceinline__ int divUp(int total, int grain)
        {
@ -104,4 +104,4 @@ namespace cv { namespace gpu



-#endif // __OPENCV_GPU_COMMON_HPP__
+#endif // __OPENCV_CUDA_COMMON_HPP__
--- a/modules/core/include/opencv2/core/cuda/datamov_utils.hpp
+++ b/modules/core/include/opencv2/core/cuda/datamov_utils.hpp
@ -40,12 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_DATAMOV_UTILS_HPP__
-#define __OPENCV_GPU_DATAMOV_UTILS_HPP__
+#ifndef __OPENCV_CUDA_DATAMOV_UTILS_HPP__
+#define __OPENCV_CUDA_DATAMOV_UTILS_HPP__

 #include "common.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200

@ -59,47 +59,47 @@ namespace cv { namespace gpu { namespace cudev

        #if defined(_WIN64) || defined(__LP64__)
            // 64-bit register modifier for inlined asm
-            #define OPENCV_GPU_ASM_PTR "l"
+            #define OPENCV_CUDA_ASM_PTR "l"
        #else
            // 32-bit register modifier for inlined asm
-            #define OPENCV_GPU_ASM_PTR "r"
+            #define OPENCV_CUDA_ASM_PTR "r"
        #endif

        template<class T> struct ForceGlob;

-        #define OPENCV_GPU_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
+        #define OPENCV_CUDA_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
            template <> struct ForceGlob<base_type> \
            { \
                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
                { \
-                    asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
                } \
            };

-        #define OPENCV_GPU_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
+        #define OPENCV_CUDA_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
            template <> struct ForceGlob<base_type> \
            { \
                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
                { \
-                    asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
                } \
            };

-            OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar,  u8)
-            OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar,  s8)
-            OPENCV_GPU_DEFINE_FORCE_GLOB_B(char,   b8)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (ushort, u16, h)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (short,  s16, h)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (uint,   u32, r)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (int,    s32, r)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (float,  f32, f)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (double, f64, d)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(uchar,  u8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(schar,  s8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(char,   b8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (ushort, u16, h)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (short,  s16, h)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (uint,   u32, r)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (int,    s32, r)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (float,  f32, f)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (double, f64, d)

-        #undef OPENCV_GPU_DEFINE_FORCE_GLOB
-        #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
-        #undef OPENCV_GPU_ASM_PTR
+        #undef OPENCV_CUDA_DEFINE_FORCE_GLOB
+        #undef OPENCV_CUDA_DEFINE_FORCE_GLOB_B
+        #undef OPENCV_CUDA_ASM_PTR

    #endif // __CUDA_ARCH__ >= 200
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__
+#endif // __OPENCV_CUDA_DATAMOV_UTILS_HPP__
--- a/modules/core/include/opencv2/core/cuda/detail/color_detail.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/color_detail.hpp
@ -40,8 +40,8 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_COLOR_DETAIL_HPP__
-#define __OPENCV_GPU_COLOR_DETAIL_HPP__
+#ifndef __OPENCV_CUDA_COLOR_DETAIL_HPP__
+#define __OPENCV_CUDA_COLOR_DETAIL_HPP__

 #include "../common.hpp"
 #include "../vec_traits.hpp"
@ -49,7 +49,7 @@
 #include "../limits.hpp"
 #include "../functional.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    #ifndef CV_DESCALE
        #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
@ -143,10 +143,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \
    struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -297,10 +297,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \
+#define OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \
    struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -343,10 +343,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \
+#define OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::Gray2RGB<T, dcn> functor_type; \
+        typedef ::cv::cuda::device::color_detail::Gray2RGB<T, dcn> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -385,10 +385,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \
+#define OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \
    struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::Gray2RGB5x5<green_bits> functor_type; \
+        typedef ::cv::cuda::device::color_detail::Gray2RGB5x5<green_bits> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -427,10 +427,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \
+#define OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \
    struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB5x52Gray<green_bits> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB5x52Gray<green_bits> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -478,10 +478,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2Gray<T, scn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2Gray<T, scn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -529,10 +529,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -617,10 +617,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -696,10 +696,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -775,10 +775,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -851,10 +851,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -926,10 +926,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1064,10 +1064,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1075,7 +1075,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <typename T> struct name ## _full_traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1083,7 +1083,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1091,7 +1091,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _full_traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1204,10 +1204,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1215,7 +1215,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <typename T> struct name ## _full_traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1223,7 +1223,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1231,7 +1231,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _full_traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1337,10 +1337,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1348,7 +1348,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <typename T> struct name ## _full_traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1356,7 +1356,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1364,7 +1364,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _full_traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1477,10 +1477,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) \
+#define OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1488,7 +1488,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <typename T> struct name ## _full_traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1496,7 +1496,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1504,7 +1504,7 @@ namespace cv { namespace gpu { namespace cudev
    }; \
    template <> struct name ## _full_traits<float> \
    { \
-        typedef ::cv::gpu::cudev::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1646,10 +1646,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(name, scn, dcn, srgb, blueIdx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(name, scn, dcn, srgb, blueIdx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2Lab<T, scn, dcn, srgb, blueIdx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2Lab<T, scn, dcn, srgb, blueIdx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1759,10 +1759,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
+#define OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::Lab2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::Lab2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1858,10 +1858,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(name, scn, dcn, srgb, blueIdx) \
+#define OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(name, scn, dcn, srgb, blueIdx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::RGB2Luv<T, scn, dcn, srgb, blueIdx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::RGB2Luv<T, scn, dcn, srgb, blueIdx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1959,10 +1959,10 @@ namespace cv { namespace gpu { namespace cudev
        };
    }

-#define OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
+#define OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
    template <typename T> struct name ## _traits \
    { \
-        typedef ::cv::gpu::cudev::color_detail::Luv2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
+        typedef ::cv::cuda::device::color_detail::Luv2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
        static __host__ __device__ __forceinline__ functor_type create_functor() \
        { \
            return functor_type(); \
@ -1971,6 +1971,6 @@ namespace cv { namespace gpu { namespace cudev

    #undef CV_DESCALE

-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_COLOR_DETAIL_HPP__
+#endif // __OPENCV_CUDA_COLOR_DETAIL_HPP__
--- a/modules/core/include/opencv2/core/cuda/detail/reduce.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/reduce.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
-#define __OPENCV_GPU_REDUCE_DETAIL_HPP__
+#ifndef __OPENCV_CUDA_REDUCE_DETAIL_HPP__
+#define __OPENCV_CUDA_REDUCE_DETAIL_HPP__

 #include <thrust/tuple.h>
 #include "../warp.hpp"
 #include "../warp_shuffle.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace reduce_detail
    {
@ -358,4 +358,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
+#endif // __OPENCV_CUDA_REDUCE_DETAIL_HPP__
--- a/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
-#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
+#ifndef __OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP__
+#define __OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP__

 #include <thrust/tuple.h>
 #include "../warp.hpp"
 #include "../warp_shuffle.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace reduce_key_val_detail
    {
@ -495,4 +495,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
+#endif // __OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP__
--- a/modules/core/include/opencv2/core/cuda/detail/transform_detail.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/transform_detail.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
-#define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
+#ifndef __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
+#define __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__

 #include "../common.hpp"
 #include "../vec_traits.hpp"
 #include "../functional.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace transform_detail
    {
@ -390,6 +390,6 @@ namespace cv { namespace gpu { namespace cudev
            }
        };
    } // namespace transform_detail
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
+#endif // __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
--- a/modules/core/include/opencv2/core/cuda/detail/type_traits_detail.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/type_traits_detail.hpp
@ -40,13 +40,13 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
-#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
+#ifndef __OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP__
+#define __OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP__

 #include "../common.hpp"
 #include "../vec_traits.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace type_traits_detail
    {
@ -182,6 +182,6 @@ namespace cv { namespace gpu { namespace cudev
            enum { value = 1 };
        };
    } // namespace type_traits_detail
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
+#endif // __OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP__
--- a/modules/core/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
@ -40,12 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
-#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
+#ifndef __OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP__
+#define __OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP__

 #include "../datamov_utils.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace vec_distance_detail
    {
@ -112,6 +112,6 @@ namespace cv { namespace gpu { namespace cudev
            }
        };
    } // namespace vec_distance_detail
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
+#endif // __OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP__
--- a/modules/core/include/opencv2/core/cuda/dynamic_smem.hpp
+++ b/modules/core/include/opencv2/core/cuda/dynamic_smem.hpp
@ -40,10 +40,10 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
-#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
+#ifndef __OPENCV_CUDA_DYNAMIC_SMEM_HPP__
+#define __OPENCV_CUDA_DYNAMIC_SMEM_HPP__

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template<class T> struct DynamicSharedMem
    {
@ -77,4 +77,4 @@ namespace cv { namespace gpu { namespace cudev
    };
 }}}

-#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__
+#endif // __OPENCV_CUDA_DYNAMIC_SMEM_HPP__
--- a/modules/core/include/opencv2/core/cuda/emulation.hpp
+++ b/modules/core/include/opencv2/core/cuda/emulation.hpp
@ -40,13 +40,13 @@
 //
 //M*/

-#ifndef OPENCV_GPU_EMULATION_HPP_
-#define OPENCV_GPU_EMULATION_HPP_
+#ifndef OPENCV_CUDA_EMULATION_HPP_
+#define OPENCV_CUDA_EMULATION_HPP_

 #include "common.hpp"
 #include "warp_reduce.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    struct Emulation
    {
@ -256,6 +256,6 @@ namespace cv { namespace gpu { namespace cudev
            }
        };
    }; //struct Emulation
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif /* OPENCV_GPU_EMULATION_HPP_ */
+#endif /* OPENCV_CUDA_EMULATION_HPP_ */
--- a/modules/core/include/opencv2/core/cuda/filters.hpp
+++ b/modules/core/include/opencv2/core/cuda/filters.hpp
@ -40,15 +40,15 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_FILTERS_HPP__
-#define __OPENCV_GPU_FILTERS_HPP__
+#ifndef __OPENCV_CUDA_FILTERS_HPP__
+#define __OPENCV_CUDA_FILTERS_HPP__

 #include "saturate_cast.hpp"
 #include "vec_traits.hpp"
 #include "vec_math.hpp"
 #include "type_traits.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template <typename Ptr2D> struct PointFilter
    {
@ -273,6 +273,6 @@ namespace cv { namespace gpu { namespace cudev
        float scale_x, scale_y;
        int width, haight;
    };
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_FILTERS_HPP__
+#endif // __OPENCV_CUDA_FILTERS_HPP__
--- a/modules/core/include/opencv2/core/cuda/funcattrib.hpp
+++ b/modules/core/include/opencv2/core/cuda/funcattrib.hpp
@ -40,12 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
-#define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
+#ifndef __OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP_
+#define __OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP_

 #include <cstdio>

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template<class Func>
    void printFuncAttrib(Func& func)
@ -66,6 +66,6 @@ namespace cv { namespace gpu { namespace cudev
        printf("\n");
        fflush(stdout);
    }
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif  /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */
+#endif  /* __OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP_ */
--- a/modules/core/include/opencv2/core/cuda/functional.hpp
+++ b/modules/core/include/opencv2/core/cuda/functional.hpp
@ -40,8 +40,8 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_FUNCTIONAL_HPP__
-#define __OPENCV_GPU_FUNCTIONAL_HPP__
+#ifndef __OPENCV_CUDA_FUNCTIONAL_HPP__
+#define __OPENCV_CUDA_FUNCTIONAL_HPP__

 #include <functional>
 #include "saturate_cast.hpp"
@ -49,7 +49,7 @@
 #include "type_traits.hpp"
 #include "device_functions.h"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    // Function Objects
    template<typename Argument, typename Result> struct unary_function : public std::unary_function<Argument, Result> {};
@ -298,7 +298,7 @@ namespace cv { namespace gpu { namespace cudev

    // Min/Max Operations

-#define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
+#define OPENCV_CUDA_IMPLEMENT_MINMAX(name, type, op) \
    template <> struct name<type> : binary_function<type, type, type> \
    { \
        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
@ -316,15 +316,15 @@ namespace cv { namespace gpu { namespace cudev
        __host__ __device__ __forceinline__ maximum(const maximum&) {}
    };

-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uchar, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, schar, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, char, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, ushort, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, short, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, int, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uint, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, float, ::fmax)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, double, ::fmax)

    template <typename T> struct minimum : binary_function<T, T, T>
    {
@ -336,17 +336,17 @@ namespace cv { namespace gpu { namespace cudev
        __host__ __device__ __forceinline__ minimum(const minimum&) {}
    };

-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uchar, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, schar, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, char, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, ushort, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, short, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, int, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uint, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, float, ::fmin)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, double, ::fmin)

-#undef OPENCV_GPU_IMPLEMENT_MINMAX
+#undef OPENCV_CUDA_IMPLEMENT_MINMAX

    // Math functions

@ -451,7 +451,7 @@ namespace cv { namespace gpu { namespace cudev
        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };

-#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
+#define OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(name, func) \
    template <typename T> struct name ## _func : unary_function<T, float> \
    { \
        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
@ -471,7 +471,7 @@ namespace cv { namespace gpu { namespace cudev
        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    };

-#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
+#define OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(name, func) \
    template <typename T> struct name ## _func : binary_function<T, T, float> \
    { \
        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
@ -491,33 +491,33 @@ namespace cv { namespace gpu { namespace cudev
        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    };

-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log, ::log)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2, ::log2)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10, ::log10)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin, ::sin)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos, ::cos)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan, ::tan)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin, ::asin)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos, ::acos)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan, ::atan)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)
-
-    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)
-    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)
-    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)
-
-    #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
-    #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR_NO_DOUBLE
-    #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log, ::log)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log2, ::log2)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log10, ::log10)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sin, ::sin)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cos, ::cos)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tan, ::tan)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asin, ::asin)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acos, ::acos)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atan, ::atan)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)
+
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)
+
+    #undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR
+    #undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR_NO_DOUBLE
+    #undef OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR

    template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
    {
@ -782,8 +782,8 @@ namespace cv { namespace gpu { namespace cudev

    template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};

-#define OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(type) \
+#define OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(type) \
    template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_FUNCTIONAL_HPP__
+#endif // __OPENCV_CUDA_FUNCTIONAL_HPP__
--- a/modules/core/include/opencv2/core/cuda/limits.hpp
+++ b/modules/core/include/opencv2/core/cuda/limits.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
-#define __OPENCV_GPU_LIMITS_GPU_HPP__
+#ifndef __OPENCV_CUDA_LIMITS_HPP__
+#define __OPENCV_CUDA_LIMITS_HPP__

 #include <limits.h>
 #include <float.h>
 #include "common.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {

 template <class T> struct numeric_limits;
@ -117,6 +117,6 @@ template <> struct numeric_limits<double>
    static const bool is_signed = true;
 };

-}}} // namespace cv { namespace gpu { namespace cudev {
+}}} // namespace cv { namespace cuda { namespace device {

-#endif // __OPENCV_GPU_LIMITS_GPU_HPP__
+#endif // __OPENCV_CUDA_LIMITS_HPP__
--- a/modules/core/include/opencv2/core/cuda/reduce.hpp
+++ b/modules/core/include/opencv2/core/cuda/reduce.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_REDUCE_HPP__
-#define __OPENCV_GPU_REDUCE_HPP__
+#ifndef __OPENCV_CUDA_REDUCE_HPP__
+#define __OPENCV_CUDA_REDUCE_HPP__

 #include <thrust/tuple.h>
 #include "detail/reduce.hpp"
 #include "detail/reduce_key_val.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template <int N, typename T, class Op>
    __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
@ -194,4 +194,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif // __OPENCV_GPU_UTILITY_HPP__
+#endif // __OPENCV_CUDA_REDUCE_HPP__
--- a/modules/core/include/opencv2/core/cuda/saturate_cast.hpp
+++ b/modules/core/include/opencv2/core/cuda/saturate_cast.hpp
@ -40,12 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_SATURATE_CAST_HPP__
-#define __OPENCV_GPU_SATURATE_CAST_HPP__
+#ifndef __OPENCV_CUDA_SATURATE_CAST_HPP__
+#define __OPENCV_CUDA_SATURATE_CAST_HPP__

 #include "common.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
@ -281,4 +281,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
+#endif /* __OPENCV_CUDA_SATURATE_CAST_HPP__ */
--- a/modules/core/include/opencv2/core/cuda/scan.hpp
+++ b/modules/core/include/opencv2/core/cuda/scan.hpp
@ -40,15 +40,15 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_SCAN_HPP__
-#define __OPENCV_GPU_SCAN_HPP__
+#ifndef __OPENCV_CUDA_SCAN_HPP__
+#define __OPENCV_CUDA_SCAN_HPP__

 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/utility.hpp"
 #include "opencv2/core/cuda/warp.hpp"
 #include "opencv2/core/cuda/warp_shuffle.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    enum ScanKind { EXCLUSIVE = 0,  INCLUSIVE = 1 };

@ -174,22 +174,22 @@ namespace cv { namespace gpu { namespace cudev
    __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
    {
    #if __CUDA_ARCH__ >= 300
-        const unsigned int laneId = cv::gpu::cudev::Warp::laneId();
+        const unsigned int laneId = cv::cuda::device::Warp::laneId();

        // scan on shuffl functions
        #pragma unroll
-        for (int i = 1; i <= (OPENCV_GPU_WARP_SIZE / 2); i *= 2)
+        for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
        {
-            const T n = cv::gpu::cudev::shfl_up(idata, i);
+            const T n = cv::cuda::device::shfl_up(idata, i);
            if (laneId >= i)
                  idata += n;
        }

        return idata;
    #else
-        unsigned int pos = 2 * tid - (tid & (OPENCV_GPU_WARP_SIZE - 1));
+        unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
        s_Data[pos] = 0;
-        pos += OPENCV_GPU_WARP_SIZE;
+        pos += OPENCV_CUDA_WARP_SIZE;
        s_Data[pos] = idata;

        s_Data[pos] += s_Data[pos - 1];
@ -211,7 +211,7 @@ namespace cv { namespace gpu { namespace cudev
    template <int tiNumScanThreads, typename T>
    __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
    {
-        if (tiNumScanThreads > OPENCV_GPU_WARP_SIZE)
+        if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
        {
            //Bottom-level inclusive warp scan
            T warpResult = warpScanInclusive(idata, s_Data, tid);
@ -219,15 +219,15 @@ namespace cv { namespace gpu { namespace cudev
            //Save top elements of each warp for exclusive warp scan
            //sync to wait for warp scans to complete (because s_Data is being overwritten)
            __syncthreads();
-            if ((tid & (OPENCV_GPU_WARP_SIZE - 1)) == (OPENCV_GPU_WARP_SIZE - 1))
+            if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
            {
-                s_Data[tid >> OPENCV_GPU_LOG_WARP_SIZE] = warpResult;
+                s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
            }

            //wait for warp scans to complete
            __syncthreads();

-            if (tid < (tiNumScanThreads / OPENCV_GPU_WARP_SIZE) )
+            if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
            {
                //grab top warp elements
                T val = s_Data[tid];
@ -238,7 +238,7 @@ namespace cv { namespace gpu { namespace cudev
            //return updated warp scans with exclusive scan results
            __syncthreads();

-            return warpResult + s_Data[tid >> OPENCV_GPU_LOG_WARP_SIZE];
+            return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
        }
        else
        {
@ -247,4 +247,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif // __OPENCV_GPU_SCAN_HPP__
+#endif // __OPENCV_CUDA_SCAN_HPP__
--- a/modules/core/include/opencv2/core/cuda/simd_functions.hpp
+++ b/modules/core/include/opencv2/core/cuda/simd_functions.hpp
@ -70,8 +70,8 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
-#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
+#ifndef __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__
+#define __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__

 #include "common.hpp"

@ -123,7 +123,7 @@
  vmin4(a,b)      per-byte unsigned minimum: min(a, b)
 */

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    // 2

@ -906,4 +906,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
+#endif // __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__
--- a/modules/core/include/opencv2/core/cuda/transform.hpp
+++ b/modules/core/include/opencv2/core/cuda/transform.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_TRANSFORM_HPP__
-#define __OPENCV_GPU_TRANSFORM_HPP__
+#ifndef __OPENCV_CUDA_TRANSFORM_HPP__
+#define __OPENCV_CUDA_TRANSFORM_HPP__

 #include "common.hpp"
 #include "utility.hpp"
 #include "detail/transform_detail.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template <typename T, typename D, typename UnOp, typename Mask>
    static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
@ -64,4 +64,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif // __OPENCV_GPU_TRANSFORM_HPP__
+#endif // __OPENCV_CUDA_TRANSFORM_HPP__
--- a/modules/core/include/opencv2/core/cuda/type_traits.hpp
+++ b/modules/core/include/opencv2/core/cuda/type_traits.hpp
@ -40,12 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
-#define __OPENCV_GPU_TYPE_TRAITS_HPP__
+#ifndef __OPENCV_CUDA_TYPE_TRAITS_HPP__
+#define __OPENCV_CUDA_TYPE_TRAITS_HPP__

 #include "detail/type_traits_detail.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template <typename T> struct IsSimpleParameter
    {
@ -79,4 +79,4 @@ namespace cv { namespace gpu { namespace cudev
    };
 }}}

-#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__
+#endif // __OPENCV_CUDA_TYPE_TRAITS_HPP__
--- a/modules/core/include/opencv2/core/cuda/utility.hpp
+++ b/modules/core/include/opencv2/core/cuda/utility.hpp
@ -40,18 +40,18 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_UTILITY_HPP__
-#define __OPENCV_GPU_UTILITY_HPP__
+#ifndef __OPENCV_CUDA_UTILITY_HPP__
+#define __OPENCV_CUDA_UTILITY_HPP__

 #include "saturate_cast.hpp"
 #include "datamov_utils.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
-    #define OPENCV_GPU_LOG_WARP_SIZE        (5)
-    #define OPENCV_GPU_WARP_SIZE            (1 << OPENCV_GPU_LOG_WARP_SIZE)
-    #define OPENCV_GPU_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
-    #define OPENCV_GPU_MEM_BANKS            (1 << OPENCV_GPU_LOG_MEM_BANKS)
+    #define OPENCV_CUDA_LOG_WARP_SIZE        (5)
+    #define OPENCV_CUDA_WARP_SIZE            (1 << OPENCV_CUDA_LOG_WARP_SIZE)
+    #define OPENCV_CUDA_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
+    #define OPENCV_CUDA_MEM_BANKS            (1 << OPENCV_CUDA_LOG_MEM_BANKS)

    ///////////////////////////////////////////////////////////////////////////////
    // swap
@ -208,6 +208,6 @@ namespace cv { namespace gpu { namespace cudev

        return false;
    }
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_UTILITY_HPP__
+#endif // __OPENCV_CUDA_UTILITY_HPP__
--- a/modules/core/include/opencv2/core/cuda/vec_distance.hpp
+++ b/modules/core/include/opencv2/core/cuda/vec_distance.hpp
@ -40,14 +40,14 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
-#define __OPENCV_GPU_VEC_DISTANCE_HPP__
+#ifndef __OPENCV_CUDA_VEC_DISTANCE_HPP__
+#define __OPENCV_CUDA_VEC_DISTANCE_HPP__

 #include "reduce.hpp"
 #include "functional.hpp"
 #include "detail/vec_distance_detail.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template <typename T> struct L1Dist
    {
@ -219,6 +219,6 @@ namespace cv { namespace gpu { namespace cudev

        U vec1Vals[MAX_LEN / THREAD_DIM];
    };
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__
+#endif // __OPENCV_CUDA_VEC_DISTANCE_HPP__
--- a/modules/core/include/opencv2/core/cuda/vec_math.hpp
+++ b/modules/core/include/opencv2/core/cuda/vec_math.hpp
@ -40,13 +40,13 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_VECMATH_HPP__
-#define __OPENCV_GPU_VECMATH_HPP__
+#ifndef __OPENCV_CUDA_VECMATH_HPP__
+#define __OPENCV_CUDA_VECMATH_HPP__

 #include "vec_traits.hpp"
 #include "saturate_cast.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {

 // saturate_cast
@ -917,6 +917,6 @@ CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)

 #undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC

-}}} // namespace cv { namespace gpu { namespace device
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_VECMATH_HPP__
+#endif // __OPENCV_CUDA_VECMATH_HPP__
--- a/modules/core/include/opencv2/core/cuda/vec_traits.hpp
+++ b/modules/core/include/opencv2/core/cuda/vec_traits.hpp
@ -40,12 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_VEC_TRAITS_HPP__
-#define __OPENCV_GPU_VEC_TRAITS_HPP__
+#ifndef __OPENCV_CUDA_VEC_TRAITS_HPP__
+#define __OPENCV_CUDA_VEC_TRAITS_HPP__

 #include "common.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template<typename T, int N> struct TypeVec;

@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace cudev
        return val;
    }

-#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
+#define OPENCV_CUDA_IMPLEMENT_TYPE_VEC(type) \
    template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
    template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
    template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; \
@ -134,16 +134,16 @@ namespace cv { namespace gpu { namespace cudev
    template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
    template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };

-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uchar)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(char)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(ushort)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(short)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(int)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uint)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(float)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(double)

-    #undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
+    #undef OPENCV_CUDA_IMPLEMENT_TYPE_VEC

    template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace cudev

    template<typename T> struct VecTraits;

-#define OPENCV_GPU_IMPLEMENT_VEC_TRAITS(type) \
+#define OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(type) \
    template<> struct VecTraits<type> \
    { \
        typedef type elem_type; \
@ -209,15 +209,15 @@ namespace cv { namespace gpu { namespace cudev
        static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
    };

-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uchar)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(ushort)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(short)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(int)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uint)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(float)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(double)

-    #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
+    #undef OPENCV_CUDA_IMPLEMENT_VEC_TRAITS

    template<> struct VecTraits<char>
    {
@ -275,6 +275,6 @@ namespace cv { namespace gpu { namespace cudev
        static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
        static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
    };
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
+#endif // __OPENCV_CUDA_VEC_TRAITS_HPP__
--- a/modules/core/include/opencv2/core/cuda/warp.hpp
+++ b/modules/core/include/opencv2/core/cuda/warp.hpp
@ -40,10 +40,10 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
-#define __OPENCV_GPU_DEVICE_WARP_HPP__
+#ifndef __OPENCV_CUDA_DEVICE_WARP_HPP__
+#define __OPENCV_CUDA_DEVICE_WARP_HPP__

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    struct Warp
    {
@ -126,6 +126,6 @@ namespace cv { namespace gpu { namespace cudev
                *t = value;
        }
    };
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device

-#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
+#endif /* __OPENCV_CUDA_DEVICE_WARP_HPP__ */
--- a/modules/core/include/opencv2/core/cuda/warp_reduce.hpp
+++ b/modules/core/include/opencv2/core/cuda/warp_reduce.hpp
@ -40,10 +40,10 @@
 //
 //M*/

-#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
-#define OPENCV_GPU_WARP_REDUCE_HPP__
+#ifndef OPENCV_CUDA_WARP_REDUCE_HPP__
+#define OPENCV_CUDA_WARP_REDUCE_HPP__

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template <class T>
    __device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
@ -63,6 +63,6 @@ namespace cv { namespace gpu { namespace cudev

        return ptr[tid - lane];
    }
-}}} // namespace cv { namespace gpu { namespace cudev {
+}}} // namespace cv { namespace cuda { namespace device {

-#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */
+#endif /* OPENCV_CUDA_WARP_REDUCE_HPP__ */
--- a/modules/core/include/opencv2/core/cuda/warp_shuffle.hpp
+++ b/modules/core/include/opencv2/core/cuda/warp_shuffle.hpp
@ -40,10 +40,10 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
-#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
+#ifndef __OPENCV_CUDA_WARP_SHUFFLE_HPP__
+#define __OPENCV_CUDA_WARP_SHUFFLE_HPP__

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    template <typename T>
    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
@ -142,4 +142,4 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
+#endif // __OPENCV_CUDA_WARP_SHUFFLE_HPP__
--- a/modules/core/include/opencv2/core/cuda_stream_accessor.hpp
+++ b/modules/core/include/opencv2/core/cuda_stream_accessor.hpp
@ -40,16 +40,16 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__
-#define __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__
+#ifndef __OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP__
+#define __OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP__

 #ifndef __cplusplus
-#  error gpu_stream_accessor.hpp header must be compiled as C++
+#  error cuda_stream_accessor.hpp header must be compiled as C++
 #endif

 // This is only header file that depends on Cuda. All other headers are independent.
 // So if you use OpenCV binaries you do noot need to install Cuda Toolkit.
-// But of you wanna use GPU by yourself, may get cuda stream instance using the class below.
+// But if you want to use CUDA by yourself, you may get the CUDA stream instance using the class below.
 // In this case you have to install Cuda Toolkit.

 #include <cuda_runtime.h>
@ -57,7 +57,7 @@

 namespace cv
 {
-    namespace gpu
+    namespace cuda
    {
        class Stream;
        class Event;
@ -74,4 +74,4 @@ namespace cv
    }
 }

-#endif /* __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__ */
+#endif /* __OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP__ */
--- a/modules/core/include/opencv2/core/cuda_types.hpp
+++ b/modules/core/include/opencv2/core/cuda_types.hpp
@ -40,22 +40,22 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_GPU_TYPES_HPP__
-#define __OPENCV_CORE_GPU_TYPES_HPP__
+#ifndef __OPENCV_CORE_CUDA_TYPES_HPP__
+#define __OPENCV_CORE_CUDA_TYPES_HPP__

 #ifndef __cplusplus
-#  error gpu_types.hpp header must be compiled as C++
+#  error cuda_types.hpp header must be compiled as C++
 #endif

 #ifdef __CUDACC__
-    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
+    #define __CV_CUDA_HOST_DEVICE__ __host__ __device__ __forceinline__
 #else
-    #define __CV_GPU_HOST_DEVICE__
+    #define __CV_CUDA_HOST_DEVICE__
 #endif

 namespace cv
 {
-    namespace gpu
+    namespace cuda
    {
        // Simple lightweight structures that encapsulates information about an image on device.
        // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
@ -69,41 +69,41 @@ namespace cv

            T* data;

-            __CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
-            __CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}
+            __CV_CUDA_HOST_DEVICE__ DevPtr() : data(0) {}
+            __CV_CUDA_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}

-            __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
-            __CV_GPU_HOST_DEVICE__ operator       T*()       { return data; }
-            __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
+            __CV_CUDA_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+            __CV_CUDA_HOST_DEVICE__ operator       T*()       { return data; }
+            __CV_CUDA_HOST_DEVICE__ operator const T*() const { return data; }
        };

        template <typename T> struct PtrSz : public DevPtr<T>
        {
-            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
-            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
+            __CV_CUDA_HOST_DEVICE__ PtrSz() : size(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}

            size_t size;
        };

        template <typename T> struct PtrStep : public DevPtr<T>
        {
-            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
-            __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
+            __CV_CUDA_HOST_DEVICE__ PtrStep() : step(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}

            //! stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!!
            size_t step;

-            __CV_GPU_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)DevPtr<T>::data + y * step); }
-            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
+            __CV_CUDA_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)DevPtr<T>::data + y * step); }
+            __CV_CUDA_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }

-            __CV_GPU_HOST_DEVICE__       T& operator ()(int y, int x)       { return ptr(y)[x]; }
-            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
+            __CV_CUDA_HOST_DEVICE__       T& operator ()(int y, int x)       { return ptr(y)[x]; }
+            __CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
        };

        template <typename T> struct PtrStepSz : public PtrStep<T>
        {
-            __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
-            __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
+            __CV_CUDA_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}

            template <typename U>
@ -123,4 +123,4 @@ namespace cv
    }
 }

-#endif /* __OPENCV_CORE_GPU_TYPES_HPP__ */
+#endif /* __OPENCV_CORE_CUDA_TYPES_HPP__ */
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -93,14 +93,14 @@ public:
    template<typename _Tp> _InputArray(const _Tp* vec, int n);
    template<typename _Tp, int m, int n> _InputArray(const Matx<_Tp, m, n>& matx);
    _InputArray(const double& val);
-    _InputArray(const gpu::GpuMat& d_mat);
+    _InputArray(const cuda::GpuMat& d_mat);
    _InputArray(const ogl::Buffer& buf);
-    _InputArray(const gpu::CudaMem& cuda_mem);
+    _InputArray(const cuda::CudaMem& cuda_mem);
    template<typename _Tp> _InputArray(const cudev::GpuMat_<_Tp>& m);

    virtual Mat getMat(int i=-1) const;
    virtual void getMatVector(std::vector<Mat>& mv) const;
-    virtual gpu::GpuMat getGpuMat() const;
+    virtual cuda::GpuMat getGpuMat() const;
    virtual ogl::Buffer getOGlBuffer() const;

    virtual int kind() const;
@ -142,9 +142,9 @@ public:
    _OutputArray();
    _OutputArray(Mat& m);
    _OutputArray(std::vector<Mat>& vec);
-    _OutputArray(gpu::GpuMat& d_mat);
+    _OutputArray(cuda::GpuMat& d_mat);
    _OutputArray(ogl::Buffer& buf);
-    _OutputArray(gpu::CudaMem& cuda_mem);
+    _OutputArray(cuda::CudaMem& cuda_mem);
    template<typename _Tp> _OutputArray(cudev::GpuMat_<_Tp>& m);
    template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
    template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
@ -155,9 +155,9 @@ public:

    _OutputArray(const Mat& m);
    _OutputArray(const std::vector<Mat>& vec);
-    _OutputArray(const gpu::GpuMat& d_mat);
+    _OutputArray(const cuda::GpuMat& d_mat);
    _OutputArray(const ogl::Buffer& buf);
-    _OutputArray(const gpu::CudaMem& cuda_mem);
+    _OutputArray(const cuda::CudaMem& cuda_mem);
    template<typename _Tp> _OutputArray(const cudev::GpuMat_<_Tp>& m);
    template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
    template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
@ -170,9 +170,9 @@ public:
    virtual bool fixedType() const;
    virtual bool needed() const;
    virtual Mat& getMatRef(int i=-1) const;
-    virtual gpu::GpuMat& getGpuMatRef() const;
+    virtual cuda::GpuMat& getGpuMatRef() const;
    virtual ogl::Buffer& getOGlBufferRef() const;
-    virtual gpu::CudaMem& getCudaMemRef() const;
+    virtual cuda::CudaMem& getCudaMemRef() const;
    virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
@ -506,7 +506,7 @@ public:
    //Mat(const void* img, bool copyData=false);

    //! download data from GpuMat
-    explicit Mat(const gpu::GpuMat& m);
+    explicit Mat(const cuda::GpuMat& m);

    //! destructor - calls release()
    ~Mat();
--- a/modules/core/include/opencv2/core/opengl.hpp
+++ b/modules/core/include/opencv2/core/opengl.hpp
@ -99,12 +99,12 @@ public:
    //! copy from host/device memory (blocking)
    void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
    //! copy from device memory (non blocking)
-    void copyFrom(InputArray arr, gpu::Stream& stream, Target target = ARRAY_BUFFER, bool autoRelease = false);
+    void copyFrom(InputArray arr, cuda::Stream& stream, Target target = ARRAY_BUFFER, bool autoRelease = false);

    //! copy to host/device memory (blocking)
    void copyTo(OutputArray arr) const;
    //! copy to device memory (non blocking)
-    void copyTo(OutputArray arr, gpu::Stream& stream) const;
+    void copyTo(OutputArray arr, cuda::Stream& stream) const;

    //! create copy of current buffer
    Buffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
@ -120,12 +120,12 @@ public:
    void unmapHost();

    //! map to device memory (blocking)
-    gpu::GpuMat mapDevice();
+    cuda::GpuMat mapDevice();
    void unmapDevice();

    //! map to device memory (non blocking)
-    gpu::GpuMat mapDevice(gpu::Stream& stream);
-    void unmapDevice(gpu::Stream& stream);
+    cuda::GpuMat mapDevice(cuda::Stream& stream);
+    void unmapDevice(cuda::Stream& stream);

    int rows() const;
    int cols() const;
@ -276,7 +276,7 @@ CV_EXPORTS void render(const Arrays& arr, InputArray indices, int mode = POINTS,

 }} // namespace cv::ogl

-namespace cv { namespace gpu {
+namespace cv { namespace cuda {

 //! set a CUDA device to use OpenGL interoperability
 CV_EXPORTS void setGlDevice(int device = 0);
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@ -41,8 +41,8 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_PRIVATE_GPU_HPP__
-#define __OPENCV_CORE_PRIVATE_GPU_HPP__
+#ifndef __OPENCV_CORE_PRIVATE_CUDA_HPP__
+#define __OPENCV_CORE_PRIVATE_CUDA_HPP__

 #ifndef __OPENCV_BUILD
 #  error this is a private header which should not be used from outside of the OpenCV library
@ -53,13 +53,13 @@
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/base.hpp"

-#include "opencv2/core/gpu.hpp"
+#include "opencv2/core/cuda.hpp"

 #ifdef HAVE_CUDA
 #  include <cuda.h>
 #  include <cuda_runtime.h>
 #  include <npp.h>
-#  include "opencv2/core/gpu_stream_accessor.hpp"
+#  include "opencv2/core/cuda_stream_accessor.hpp"
 #  include "opencv2/core/cuda/common.hpp"

 #  define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
@ -71,24 +71,24 @@
 #  endif

 #  if defined(CUDA_ARCH_BIN_OR_PTX_10)
-#    error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
+#    error "OpenCV CUDA module doesn't support NVIDIA compute capability 1.0"
 #  endif
 #endif

-namespace cv { namespace gpu {
+namespace cv { namespace cuda {
    CV_EXPORTS cv::String getNppErrorMessage(int code);
    CV_EXPORTS cv::String getCudaDriverApiErrorMessage(int code);
 }}

 #ifndef HAVE_CUDA

-static inline void throw_no_cuda() { CV_Error(cv::Error::GpuNotSupported, "The library is compiled without GPU support"); }
+static inline void throw_no_cuda() { CV_Error(cv::Error::GpuNotSupported, "The library is compiled without CUDA support"); }

 #else // HAVE_CUDA

 static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform"); }

-namespace cv { namespace gpu
+namespace cv { namespace cuda
 {
    static inline void checkNppError(int code, const char* file, const int line, const char* func)
    {
@ -131,13 +131,13 @@ namespace cv { namespace gpu
 }}

 #if defined(__GNUC__)
-    #define nppSafeCall(expr)  cv::gpu::checkNppError(expr, __FILE__, __LINE__, __func__)
-    #define cuSafeCall(expr)  cv::gpu::checkCudaDriverApiError(expr, __FILE__, __LINE__, __func__)
+    #define nppSafeCall(expr)  cv::cuda::checkNppError(expr, __FILE__, __LINE__, __func__)
+    #define cuSafeCall(expr)  cv::cuda::checkCudaDriverApiError(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
-    #define nppSafeCall(expr)  cv::gpu::checkNppError(expr, __FILE__, __LINE__, "")
-    #define cuSafeCall(expr)  cv::gpu::checkCudaDriverApiError(expr, __FILE__, __LINE__, "")
+    #define nppSafeCall(expr)  cv::cuda::checkNppError(expr, __FILE__, __LINE__, "")
+    #define cuSafeCall(expr)  cv::cuda::checkCudaDriverApiError(expr, __FILE__, __LINE__, "")
 #endif

 #endif // HAVE_CUDA

-#endif // __OPENCV_CORE_GPU_PRIVATE_HPP__
+#endif // __OPENCV_CORE_PRIVATE_CUDA_HPP__
--- a/modules/core/src/cuda/gpu_mat.cu
+++ b/modules/core/src/cuda/gpu_mat.cu
@ -48,17 +48,17 @@

 #else

-#include "opencv2/core/gpu.hpp"
+#include "opencv2/core/cuda.hpp"
 #include "opencv2/cudev.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;
 using namespace cv::cudev;

 /////////////////////////////////////////////////////
 /// create

-void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
+void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
 {
    CV_DbgAssert( _rows >= 0 && _cols >= 0 );

@ -108,7 +108,7 @@ void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
 /////////////////////////////////////////////////////
 /// release

-void cv::gpu::GpuMat::release()
+void cv::cuda::GpuMat::release()
 {
    if (refcount && CV_XADD(refcount, -1) == 1)
    {
@ -124,7 +124,7 @@ void cv::gpu::GpuMat::release()
 /////////////////////////////////////////////////////
 /// upload

-void cv::gpu::GpuMat::upload(InputArray arr)
+void cv::cuda::GpuMat::upload(InputArray arr)
 {
    Mat mat = arr.getMat();

@ -135,7 +135,7 @@ void cv::gpu::GpuMat::upload(InputArray arr)
    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) );
 }

-void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream)
+void cv::cuda::GpuMat::upload(InputArray arr, Stream& _stream)
 {
    Mat mat = arr.getMat();

@ -150,7 +150,7 @@ void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream)
 /////////////////////////////////////////////////////
 /// download

-void cv::gpu::GpuMat::download(OutputArray _dst) const
+void cv::cuda::GpuMat::download(OutputArray _dst) const
 {
    CV_DbgAssert( !empty() );

@ -160,7 +160,7 @@ void cv::gpu::GpuMat::download(OutputArray _dst) const
    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) );
 }

-void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const
+void cv::cuda::GpuMat::download(OutputArray _dst, Stream& _stream) const
 {
    CV_DbgAssert( !empty() );

@ -174,7 +174,7 @@ void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const
 /////////////////////////////////////////////////////
 /// copyTo

-void cv::gpu::GpuMat::copyTo(OutputArray _dst) const
+void cv::cuda::GpuMat::copyTo(OutputArray _dst) const
 {
    CV_DbgAssert( !empty() );

@ -184,7 +184,7 @@ void cv::gpu::GpuMat::copyTo(OutputArray _dst) const
    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) );
 }

-void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
+void cv::cuda::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
 {
    CV_DbgAssert( !empty() );

@ -220,7 +220,7 @@ namespace
    }
 }

-void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& stream) const
+void cv::cuda::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& stream) const
 {
    CV_DbgAssert( !empty() );
    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
@ -279,7 +279,7 @@ namespace
    }
 }

-GpuMat& cv::gpu::GpuMat::setTo(Scalar value, Stream& stream)
+GpuMat& cv::cuda::GpuMat::setTo(Scalar value, Stream& stream)
 {
    CV_DbgAssert( !empty() );
    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
@ -333,7 +333,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar value, Stream& stream)
    return *this;
 }

-GpuMat& cv::gpu::GpuMat::setTo(Scalar value, InputArray _mask, Stream& stream)
+GpuMat& cv::cuda::GpuMat::setTo(Scalar value, InputArray _mask, Stream& stream)
 {
    CV_DbgAssert( !empty() );
    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
@ -412,7 +412,7 @@ namespace
    }
 }

-void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) const
+void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) const
 {
    if (rtype < 0)
        rtype = type();
@ -453,7 +453,7 @@ void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) con
    funcs[sdepth][ddepth](reshape(1), dst.reshape(1), stream);
 }

-void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& stream) const
+void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& stream) const
 {
    if (rtype < 0)
        rtype = type();
--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@ -44,9 +44,9 @@
 #include "precomp.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

-cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
+cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
    flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_),
    step(step_), data((uchar*)data_), refcount(0),
    datastart((uchar*)data_), dataend((uchar*)data_)
@ -71,7 +71,7 @@ cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t ste
    dataend += step * (rows - 1) + minstep;
 }

-cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
+cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
    flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(size_.height), cols(size_.width),
    step(step_), data((uchar*)data_), refcount(0),
    datastart((uchar*)data_), dataend((uchar*)data_)
@ -95,7 +95,7 @@ cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
    dataend += step * (rows - 1) + minstep;
 }

-cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
+cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
 {
    flags = m.flags;
    step = m.step; refcount = m.refcount;
@ -136,7 +136,7 @@ cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
        rows = cols = 0;
 }

-cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
+cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
    flags(m.flags), rows(roi.height), cols(roi.width),
    step(m.step), data(m.data + roi.y*step), refcount(m.refcount),
    datastart(m.datastart), dataend(m.dataend)
@ -153,7 +153,7 @@ cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
        rows = cols = 0;
 }

-GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
+GpuMat cv::cuda::GpuMat::reshape(int new_cn, int new_rows) const
 {
    GpuMat hdr = *this;

@ -196,7 +196,7 @@ GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
    return hdr;
 }

-void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
+void cv::cuda::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
 {
    CV_DbgAssert( step > 0 );

@ -222,7 +222,7 @@ void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
    wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);
 }

-GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)
+GpuMat& cv::cuda::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)
 {
    Size wholeSize;
    Point ofs;
@ -262,7 +262,7 @@ namespace
    }
 }

-void cv::gpu::createContinuous(int rows, int cols, int type, OutputArray arr)
+void cv::cuda::createContinuous(int rows, int cols, int type, OutputArray arr)
 {
    switch (arr.kind())
    {
@ -316,7 +316,7 @@ namespace
    }
 }

-void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
+void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
 {
    switch (arr.kind())
    {
@ -337,7 +337,7 @@ void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
    }
 }

-GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
+GpuMat cv::cuda::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
 {
    if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
        return mat(Rect(0, 0, cols, rows));
@ -347,7 +347,7 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)

 #ifndef HAVE_CUDA

-void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
+void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
 {
    (void) _rows;
    (void) _cols;
@ -355,50 +355,50 @@ void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::release()
+void cv::cuda::GpuMat::release()
 {
 }

-void cv::gpu::GpuMat::upload(InputArray arr)
+void cv::cuda::GpuMat::upload(InputArray arr)
 {
    (void) arr;
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream)
+void cv::cuda::GpuMat::upload(InputArray arr, Stream& _stream)
 {
    (void) arr;
    (void) _stream;
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::download(OutputArray _dst) const
+void cv::cuda::GpuMat::download(OutputArray _dst) const
 {
    (void) _dst;
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const
+void cv::cuda::GpuMat::download(OutputArray _dst, Stream& _stream) const
 {
    (void) _dst;
    (void) _stream;
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::copyTo(OutputArray _dst) const
+void cv::cuda::GpuMat::copyTo(OutputArray _dst) const
 {
    (void) _dst;
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
+void cv::cuda::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
 {
    (void) _dst;
    (void) _stream;
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& _stream) const
+void cv::cuda::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& _stream) const
 {
    (void) _dst;
    (void) _mask;
@ -406,7 +406,7 @@ void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& _stream
    throw_no_cuda();
 }

-GpuMat& cv::gpu::GpuMat::setTo(Scalar s, Stream& _stream)
+GpuMat& cv::cuda::GpuMat::setTo(Scalar s, Stream& _stream)
 {
    (void) s;
    (void) _stream;
@ -414,7 +414,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, Stream& _stream)
    return *this;
 }

-GpuMat& cv::gpu::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
+GpuMat& cv::cuda::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
 {
    (void) s;
    (void) _mask;
@ -423,7 +423,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
    return *this;
 }

-void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) const
+void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) const
 {
    (void) _dst;
    (void) rtype;
@ -431,7 +431,7 @@ void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) co
    throw_no_cuda();
 }

-void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& _stream) const
+void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& _stream) const
 {
    (void) _dst;
    (void) rtype;
--- a/modules/core/src/cuda_host_mem.cpp
+++ b/modules/core/src/cuda_host_mem.cpp
@ -44,7 +44,7 @@
 #include "precomp.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

 namespace
 {
@ -57,7 +57,7 @@ namespace
    }
 }

-void cv::gpu::CudaMem::create(int rows_, int cols_, int type_)
+void cv::cuda::CudaMem::create(int rows_, int cols_, int type_)
 {
 #ifndef HAVE_CUDA
    (void) rows_;
@ -121,7 +121,7 @@ void cv::gpu::CudaMem::create(int rows_, int cols_, int type_)
 #endif
 }

-CudaMem cv::gpu::CudaMem::reshape(int new_cn, int new_rows) const
+CudaMem cv::cuda::CudaMem::reshape(int new_cn, int new_rows) const
 {
    CudaMem hdr = *this;

@ -164,7 +164,7 @@ CudaMem cv::gpu::CudaMem::reshape(int new_cn, int new_rows) const
    return hdr;
 }

-void cv::gpu::CudaMem::release()
+void cv::cuda::CudaMem::release()
 {
 #ifdef HAVE_CUDA
    if (refcount && CV_XADD(refcount, -1) == 1)
@ -179,7 +179,7 @@ void cv::gpu::CudaMem::release()
 #endif
 }

-GpuMat cv::gpu::CudaMem::createGpuMatHeader() const
+GpuMat cv::cuda::CudaMem::createGpuMatHeader() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -194,7 +194,7 @@ GpuMat cv::gpu::CudaMem::createGpuMatHeader() const
 #endif
 }

-void cv::gpu::registerPageLocked(Mat& m)
+void cv::cuda::registerPageLocked(Mat& m)
 {
 #ifndef HAVE_CUDA
    (void) m;
@ -205,7 +205,7 @@ void cv::gpu::registerPageLocked(Mat& m)
 #endif
 }

-void cv::gpu::unregisterPageLocked(Mat& m)
+void cv::cuda::unregisterPageLocked(Mat& m)
 {
 #ifndef HAVE_CUDA
    (void) m;
--- a/modules/core/src/cuda_info.cpp
+++ b/modules/core/src/cuda_info.cpp
@ -43,9 +43,9 @@
 #include "precomp.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

-int cv::gpu::getCudaEnabledDeviceCount()
+int cv::cuda::getCudaEnabledDeviceCount()
 {
 #ifndef HAVE_CUDA
    return 0;
@ -64,7 +64,7 @@ int cv::gpu::getCudaEnabledDeviceCount()
 #endif
 }

-void cv::gpu::setDevice(int device)
+void cv::cuda::setDevice(int device)
 {
 #ifndef HAVE_CUDA
    (void) device;
@ -74,7 +74,7 @@ void cv::gpu::setDevice(int device)
 #endif
 }

-int cv::gpu::getDevice()
+int cv::cuda::getDevice()
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -86,7 +86,7 @@ int cv::gpu::getDevice()
 #endif
 }

-void cv::gpu::resetDevice()
+void cv::cuda::resetDevice()
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -95,7 +95,7 @@ void cv::gpu::resetDevice()
 #endif
 }

-bool cv::gpu::deviceSupports(FeatureSet feature_set)
+bool cv::cuda::deviceSupports(FeatureSet feature_set)
 {
 #ifndef HAVE_CUDA
    (void) feature_set;
@ -225,7 +225,7 @@ namespace

 #endif

-bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
+bool cv::cuda::TargetArchs::builtWith(cv::cuda::FeatureSet feature_set)
 {
 #ifndef HAVE_CUDA
    (void) feature_set;
@ -236,7 +236,7 @@ bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
 #endif
 }

-bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
+bool cv::cuda::TargetArchs::hasPtx(int major, int minor)
 {
 #ifndef HAVE_CUDA
    (void) major;
@ -248,7 +248,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 #endif
 }

-bool cv::gpu::TargetArchs::hasBin(int major, int minor)
+bool cv::cuda::TargetArchs::hasBin(int major, int minor)
 {
 #ifndef HAVE_CUDA
    (void) major;
@ -260,7 +260,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 #endif
 }

-bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
+bool cv::cuda::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 {
 #ifndef HAVE_CUDA
    (void) major;
@ -272,7 +272,7 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 #endif
 }

-bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
+bool cv::cuda::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 {
 #ifndef HAVE_CUDA
    (void) major;
@ -284,7 +284,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 #endif
 }

-bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
+bool cv::cuda::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 {
 #ifndef HAVE_CUDA
    (void) major;
@ -345,7 +345,7 @@ namespace

 #endif

-const char* cv::gpu::DeviceInfo::name() const
+const char* cv::cuda::DeviceInfo::name() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -355,7 +355,7 @@ const char* cv::gpu::DeviceInfo::name() const
 #endif
 }

-size_t cv::gpu::DeviceInfo::totalGlobalMem() const
+size_t cv::cuda::DeviceInfo::totalGlobalMem() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -365,7 +365,7 @@ size_t cv::gpu::DeviceInfo::totalGlobalMem() const
 #endif
 }

-size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const
+size_t cv::cuda::DeviceInfo::sharedMemPerBlock() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -375,7 +375,7 @@ size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const
 #endif
 }

-int cv::gpu::DeviceInfo::regsPerBlock() const
+int cv::cuda::DeviceInfo::regsPerBlock() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -385,7 +385,7 @@ int cv::gpu::DeviceInfo::regsPerBlock() const
 #endif
 }

-int cv::gpu::DeviceInfo::warpSize() const
+int cv::cuda::DeviceInfo::warpSize() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -395,7 +395,7 @@ int cv::gpu::DeviceInfo::warpSize() const
 #endif
 }

-size_t cv::gpu::DeviceInfo::memPitch() const
+size_t cv::cuda::DeviceInfo::memPitch() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -405,7 +405,7 @@ size_t cv::gpu::DeviceInfo::memPitch() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxThreadsPerBlock() const
+int cv::cuda::DeviceInfo::maxThreadsPerBlock() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -415,7 +415,7 @@ int cv::gpu::DeviceInfo::maxThreadsPerBlock() const
 #endif
 }

-Vec3i cv::gpu::DeviceInfo::maxThreadsDim() const
+Vec3i cv::cuda::DeviceInfo::maxThreadsDim() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -425,7 +425,7 @@ Vec3i cv::gpu::DeviceInfo::maxThreadsDim() const
 #endif
 }

-Vec3i cv::gpu::DeviceInfo::maxGridSize() const
+Vec3i cv::cuda::DeviceInfo::maxGridSize() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -435,7 +435,7 @@ Vec3i cv::gpu::DeviceInfo::maxGridSize() const
 #endif
 }

-int cv::gpu::DeviceInfo::clockRate() const
+int cv::cuda::DeviceInfo::clockRate() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -445,7 +445,7 @@ int cv::gpu::DeviceInfo::clockRate() const
 #endif
 }

-size_t cv::gpu::DeviceInfo::totalConstMem() const
+size_t cv::cuda::DeviceInfo::totalConstMem() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -455,7 +455,7 @@ size_t cv::gpu::DeviceInfo::totalConstMem() const
 #endif
 }

-int cv::gpu::DeviceInfo::majorVersion() const
+int cv::cuda::DeviceInfo::majorVersion() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -465,7 +465,7 @@ int cv::gpu::DeviceInfo::majorVersion() const
 #endif
 }

-int cv::gpu::DeviceInfo::minorVersion() const
+int cv::cuda::DeviceInfo::minorVersion() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -475,7 +475,7 @@ int cv::gpu::DeviceInfo::minorVersion() const
 #endif
 }

-size_t cv::gpu::DeviceInfo::textureAlignment() const
+size_t cv::cuda::DeviceInfo::textureAlignment() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -485,7 +485,7 @@ size_t cv::gpu::DeviceInfo::textureAlignment() const
 #endif
 }

-size_t cv::gpu::DeviceInfo::texturePitchAlignment() const
+size_t cv::cuda::DeviceInfo::texturePitchAlignment() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -495,7 +495,7 @@ size_t cv::gpu::DeviceInfo::texturePitchAlignment() const
 #endif
 }

-int cv::gpu::DeviceInfo::multiProcessorCount() const
+int cv::cuda::DeviceInfo::multiProcessorCount() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -505,7 +505,7 @@ int cv::gpu::DeviceInfo::multiProcessorCount() const
 #endif
 }

-bool cv::gpu::DeviceInfo::kernelExecTimeoutEnabled() const
+bool cv::cuda::DeviceInfo::kernelExecTimeoutEnabled() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -515,7 +515,7 @@ bool cv::gpu::DeviceInfo::kernelExecTimeoutEnabled() const
 #endif
 }

-bool cv::gpu::DeviceInfo::integrated() const
+bool cv::cuda::DeviceInfo::integrated() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -525,7 +525,7 @@ bool cv::gpu::DeviceInfo::integrated() const
 #endif
 }

-bool cv::gpu::DeviceInfo::canMapHostMemory() const
+bool cv::cuda::DeviceInfo::canMapHostMemory() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -535,7 +535,7 @@ bool cv::gpu::DeviceInfo::canMapHostMemory() const
 #endif
 }

-DeviceInfo::ComputeMode cv::gpu::DeviceInfo::computeMode() const
+DeviceInfo::ComputeMode cv::cuda::DeviceInfo::computeMode() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -553,7 +553,7 @@ DeviceInfo::ComputeMode cv::gpu::DeviceInfo::computeMode() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxTexture1D() const
+int cv::cuda::DeviceInfo::maxTexture1D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -563,7 +563,7 @@ int cv::gpu::DeviceInfo::maxTexture1D() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxTexture1DMipmap() const
+int cv::cuda::DeviceInfo::maxTexture1DMipmap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -578,7 +578,7 @@ int cv::gpu::DeviceInfo::maxTexture1DMipmap() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxTexture1DLinear() const
+int cv::cuda::DeviceInfo::maxTexture1DLinear() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -588,7 +588,7 @@ int cv::gpu::DeviceInfo::maxTexture1DLinear() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxTexture2D() const
+Vec2i cv::cuda::DeviceInfo::maxTexture2D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -598,7 +598,7 @@ Vec2i cv::gpu::DeviceInfo::maxTexture2D() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxTexture2DMipmap() const
+Vec2i cv::cuda::DeviceInfo::maxTexture2DMipmap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -613,7 +613,7 @@ Vec2i cv::gpu::DeviceInfo::maxTexture2DMipmap() const
 #endif
 }

-Vec3i cv::gpu::DeviceInfo::maxTexture2DLinear() const
+Vec3i cv::cuda::DeviceInfo::maxTexture2DLinear() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -623,7 +623,7 @@ Vec3i cv::gpu::DeviceInfo::maxTexture2DLinear() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxTexture2DGather() const
+Vec2i cv::cuda::DeviceInfo::maxTexture2DGather() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -633,7 +633,7 @@ Vec2i cv::gpu::DeviceInfo::maxTexture2DGather() const
 #endif
 }

-Vec3i cv::gpu::DeviceInfo::maxTexture3D() const
+Vec3i cv::cuda::DeviceInfo::maxTexture3D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -643,7 +643,7 @@ Vec3i cv::gpu::DeviceInfo::maxTexture3D() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxTextureCubemap() const
+int cv::cuda::DeviceInfo::maxTextureCubemap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -653,7 +653,7 @@ int cv::gpu::DeviceInfo::maxTextureCubemap() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxTexture1DLayered() const
+Vec2i cv::cuda::DeviceInfo::maxTexture1DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -663,7 +663,7 @@ Vec2i cv::gpu::DeviceInfo::maxTexture1DLayered() const
 #endif
 }

-Vec3i cv::gpu::DeviceInfo::maxTexture2DLayered() const
+Vec3i cv::cuda::DeviceInfo::maxTexture2DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -673,7 +673,7 @@ Vec3i cv::gpu::DeviceInfo::maxTexture2DLayered() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxTextureCubemapLayered() const
+Vec2i cv::cuda::DeviceInfo::maxTextureCubemapLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -683,7 +683,7 @@ Vec2i cv::gpu::DeviceInfo::maxTextureCubemapLayered() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxSurface1D() const
+int cv::cuda::DeviceInfo::maxSurface1D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -693,7 +693,7 @@ int cv::gpu::DeviceInfo::maxSurface1D() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxSurface2D() const
+Vec2i cv::cuda::DeviceInfo::maxSurface2D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -703,7 +703,7 @@ Vec2i cv::gpu::DeviceInfo::maxSurface2D() const
 #endif
 }

-Vec3i cv::gpu::DeviceInfo::maxSurface3D() const
+Vec3i cv::cuda::DeviceInfo::maxSurface3D() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -713,7 +713,7 @@ Vec3i cv::gpu::DeviceInfo::maxSurface3D() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxSurface1DLayered() const
+Vec2i cv::cuda::DeviceInfo::maxSurface1DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -723,7 +723,7 @@ Vec2i cv::gpu::DeviceInfo::maxSurface1DLayered() const
 #endif
 }

-Vec3i cv::gpu::DeviceInfo::maxSurface2DLayered() const
+Vec3i cv::cuda::DeviceInfo::maxSurface2DLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -733,7 +733,7 @@ Vec3i cv::gpu::DeviceInfo::maxSurface2DLayered() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxSurfaceCubemap() const
+int cv::cuda::DeviceInfo::maxSurfaceCubemap() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -743,7 +743,7 @@ int cv::gpu::DeviceInfo::maxSurfaceCubemap() const
 #endif
 }

-Vec2i cv::gpu::DeviceInfo::maxSurfaceCubemapLayered() const
+Vec2i cv::cuda::DeviceInfo::maxSurfaceCubemapLayered() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -753,7 +753,7 @@ Vec2i cv::gpu::DeviceInfo::maxSurfaceCubemapLayered() const
 #endif
 }

-size_t cv::gpu::DeviceInfo::surfaceAlignment() const
+size_t cv::cuda::DeviceInfo::surfaceAlignment() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -763,7 +763,7 @@ size_t cv::gpu::DeviceInfo::surfaceAlignment() const
 #endif
 }

-bool cv::gpu::DeviceInfo::concurrentKernels() const
+bool cv::cuda::DeviceInfo::concurrentKernels() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -773,7 +773,7 @@ bool cv::gpu::DeviceInfo::concurrentKernels() const
 #endif
 }

-bool cv::gpu::DeviceInfo::ECCEnabled() const
+bool cv::cuda::DeviceInfo::ECCEnabled() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -783,7 +783,7 @@ bool cv::gpu::DeviceInfo::ECCEnabled() const
 #endif
 }

-int cv::gpu::DeviceInfo::pciBusID() const
+int cv::cuda::DeviceInfo::pciBusID() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -793,7 +793,7 @@ int cv::gpu::DeviceInfo::pciBusID() const
 #endif
 }

-int cv::gpu::DeviceInfo::pciDeviceID() const
+int cv::cuda::DeviceInfo::pciDeviceID() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -803,7 +803,7 @@ int cv::gpu::DeviceInfo::pciDeviceID() const
 #endif
 }

-int cv::gpu::DeviceInfo::pciDomainID() const
+int cv::cuda::DeviceInfo::pciDomainID() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -813,7 +813,7 @@ int cv::gpu::DeviceInfo::pciDomainID() const
 #endif
 }

-bool cv::gpu::DeviceInfo::tccDriver() const
+bool cv::cuda::DeviceInfo::tccDriver() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -823,7 +823,7 @@ bool cv::gpu::DeviceInfo::tccDriver() const
 #endif
 }

-int cv::gpu::DeviceInfo::asyncEngineCount() const
+int cv::cuda::DeviceInfo::asyncEngineCount() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -833,7 +833,7 @@ int cv::gpu::DeviceInfo::asyncEngineCount() const
 #endif
 }

-bool cv::gpu::DeviceInfo::unifiedAddressing() const
+bool cv::cuda::DeviceInfo::unifiedAddressing() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -843,7 +843,7 @@ bool cv::gpu::DeviceInfo::unifiedAddressing() const
 #endif
 }

-int cv::gpu::DeviceInfo::memoryClockRate() const
+int cv::cuda::DeviceInfo::memoryClockRate() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -853,7 +853,7 @@ int cv::gpu::DeviceInfo::memoryClockRate() const
 #endif
 }

-int cv::gpu::DeviceInfo::memoryBusWidth() const
+int cv::cuda::DeviceInfo::memoryBusWidth() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -863,7 +863,7 @@ int cv::gpu::DeviceInfo::memoryBusWidth() const
 #endif
 }

-int cv::gpu::DeviceInfo::l2CacheSize() const
+int cv::cuda::DeviceInfo::l2CacheSize() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -873,7 +873,7 @@ int cv::gpu::DeviceInfo::l2CacheSize() const
 #endif
 }

-int cv::gpu::DeviceInfo::maxThreadsPerMultiProcessor() const
+int cv::cuda::DeviceInfo::maxThreadsPerMultiProcessor() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -883,7 +883,7 @@ int cv::gpu::DeviceInfo::maxThreadsPerMultiProcessor() const
 #endif
 }

-void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
+void cv::cuda::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
 {
 #ifndef HAVE_CUDA
    (void) _totalMemory;
@ -901,7 +901,7 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory)
 #endif
 }

-bool cv::gpu::DeviceInfo::isCompatible() const
+bool cv::cuda::DeviceInfo::isCompatible() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -951,7 +951,7 @@ namespace

 #endif

-void cv::gpu::printCudaDeviceInfo(int device)
+void cv::cuda::printCudaDeviceInfo(int device)
 {
 #ifndef HAVE_CUDA
    (void) device;
@ -1037,7 +1037,7 @@ void cv::gpu::printCudaDeviceInfo(int device)
 #endif
 }

-void cv::gpu::printShortCudaDeviceInfo(int device)
+void cv::cuda::printShortCudaDeviceInfo(int device)
 {
 #ifndef HAVE_CUDA
    (void) device;
@ -1251,7 +1251,7 @@ namespace

 #endif

-String cv::gpu::getNppErrorMessage(int code)
+String cv::cuda::getNppErrorMessage(int code)
 {
 #ifndef HAVE_CUDA
    (void) code;
@ -1261,7 +1261,7 @@ String cv::gpu::getNppErrorMessage(int code)
 #endif
 }

-String cv::gpu::getCudaDriverApiErrorMessage(int code)
+String cv::cuda::getCudaDriverApiErrorMessage(int code)
 {
 #ifndef HAVE_CUDA
    (void) code;
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@ -43,14 +43,14 @@
 #include "precomp.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

 ////////////////////////////////////////////////////////////////
 // Stream

 #ifndef HAVE_CUDA

-class cv::gpu::Stream::Impl
+class cv::cuda::Stream::Impl
 {
 public:
    Impl(void* ptr = 0)
@ -62,7 +62,7 @@ public:

 #else

-class cv::gpu::Stream::Impl
+class cv::cuda::Stream::Impl
 {
 public:
    cudaStream_t stream;
@ -73,29 +73,29 @@ public:
    ~Impl();
 };

-cv::gpu::Stream::Impl::Impl() : stream(0)
+cv::cuda::Stream::Impl::Impl() : stream(0)
 {
    cudaSafeCall( cudaStreamCreate(&stream) );
 }

-cv::gpu::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
+cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
 {
 }

-cv::gpu::Stream::Impl::~Impl()
+cv::cuda::Stream::Impl::~Impl()
 {
    if (stream)
        cudaStreamDestroy(stream);
 }

-cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
+cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
 {
    return stream.impl_->stream;
 }

 #endif

-cv::gpu::Stream::Stream()
+cv::cuda::Stream::Stream()
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -104,7 +104,7 @@ cv::gpu::Stream::Stream()
 #endif
 }

-bool cv::gpu::Stream::queryIfComplete() const
+bool cv::cuda::Stream::queryIfComplete() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -120,7 +120,7 @@ bool cv::gpu::Stream::queryIfComplete() const
 #endif
 }

-void cv::gpu::Stream::waitForCompletion()
+void cv::cuda::Stream::waitForCompletion()
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -129,7 +129,7 @@ void cv::gpu::Stream::waitForCompletion()
 #endif
 }

-void cv::gpu::Stream::waitEvent(const Event& event)
+void cv::cuda::Stream::waitEvent(const Event& event)
 {
 #ifndef HAVE_CUDA
    (void) event;
@ -161,7 +161,7 @@ namespace

 #endif

-void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
+void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
 {
 #ifndef HAVE_CUDA
    (void) callback;
@ -180,13 +180,13 @@ void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userDat
 #endif
 }

-Stream& cv::gpu::Stream::Null()
+Stream& cv::cuda::Stream::Null()
 {
    static Stream s(Ptr<Impl>(new Impl(0)));
    return s;
 }

-cv::gpu::Stream::operator bool_type() const
+cv::cuda::Stream::operator bool_type() const
 {
 #ifndef HAVE_CUDA
    return 0;
@ -201,7 +201,7 @@ cv::gpu::Stream::operator bool_type() const

 #ifndef HAVE_CUDA

-class cv::gpu::Event::Impl
+class cv::cuda::Event::Impl
 {
 public:
    Impl(unsigned int)
@ -212,7 +212,7 @@ public:

 #else

-class cv::gpu::Event::Impl
+class cv::cuda::Event::Impl
 {
 public:
    cudaEvent_t event;
@ -221,25 +221,25 @@ public:
    ~Impl();
 };

-cv::gpu::Event::Impl::Impl(unsigned int flags) : event(0)
+cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0)
 {
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
 }

-cv::gpu::Event::Impl::~Impl()
+cv::cuda::Event::Impl::~Impl()
 {
    if (event)
        cudaEventDestroy(event);
 }

-cudaEvent_t cv::gpu::EventAccessor::getEvent(const Event& event)
+cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
 {
    return event.impl_->event;
 }

 #endif

-cv::gpu::Event::Event(CreateFlags flags)
+cv::cuda::Event::Event(CreateFlags flags)
 {
 #ifndef HAVE_CUDA
    (void) flags;
@ -249,7 +249,7 @@ cv::gpu::Event::Event(CreateFlags flags)
 #endif
 }

-void cv::gpu::Event::record(Stream& stream)
+void cv::cuda::Event::record(Stream& stream)
 {
 #ifndef HAVE_CUDA
    (void) stream;
@ -259,7 +259,7 @@ void cv::gpu::Event::record(Stream& stream)
 #endif
 }

-bool cv::gpu::Event::queryIfComplete() const
+bool cv::cuda::Event::queryIfComplete() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -275,7 +275,7 @@ bool cv::gpu::Event::queryIfComplete() const
 #endif
 }

-void cv::gpu::Event::waitForCompletion()
+void cv::cuda::Event::waitForCompletion()
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -284,7 +284,7 @@ void cv::gpu::Event::waitForCompletion()
 #endif
 }

-float cv::gpu::Event::elapsedTime(const Event& start, const Event& end)
+float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
 {
 #ifndef HAVE_CUDA
    (void) start;
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -943,9 +943,9 @@ _InputArray::_InputArray(const Mat& m) : flags(MAT), obj((void*)&m) {}
 _InputArray::_InputArray(const std::vector<Mat>& vec) : flags(STD_VECTOR_MAT), obj((void*)&vec) {}
 _InputArray::_InputArray(const double& val) : flags(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F), obj((void*)&val), sz(Size(1,1)) {}
 _InputArray::_InputArray(const MatExpr& expr) : flags(FIXED_TYPE + FIXED_SIZE + EXPR), obj((void*)&expr) {}
-_InputArray::_InputArray(const gpu::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {}
+_InputArray::_InputArray(const cuda::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {}
 _InputArray::_InputArray(const ogl::Buffer& buf) : flags(OPENGL_BUFFER), obj((void*)&buf) {}
-_InputArray::_InputArray(const gpu::CudaMem& cuda_mem) : flags(CUDA_MEM), obj((void*)&cuda_mem) {}
+_InputArray::_InputArray(const cuda::CudaMem& cuda_mem) : flags(CUDA_MEM), obj((void*)&cuda_mem) {}

 _InputArray::~_InputArray() {}

@ -1018,7 +1018,7 @@ Mat _InputArray::getMat(int i) const
    if( k == GPU_MAT )
    {
        CV_Assert( i < 0 );
-        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call download method for gpu::GpuMat object");
+        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call download method for cuda::GpuMat object");
        return Mat();
    }

@ -1027,7 +1027,7 @@ Mat _InputArray::getMat(int i) const
    {
        CV_Assert( i < 0 );

-        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+        const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;

        return cuda_mem->createMatHeader();
    }
@ -1120,33 +1120,33 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
    }
 }

-gpu::GpuMat _InputArray::getGpuMat() const
+cuda::GpuMat _InputArray::getGpuMat() const
 {
    int k = kind();

    if (k == GPU_MAT)
    {
-        const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
+        const cuda::GpuMat* d_mat = (const cuda::GpuMat*)obj;
        return *d_mat;
    }

    if (k == CUDA_MEM)
    {
-        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+        const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
        return cuda_mem->createGpuMatHeader();
    }

    if (k == OPENGL_BUFFER)
    {
        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call mapDevice/unmapDevice methods for ogl::Buffer object");
-        return gpu::GpuMat();
+        return cuda::GpuMat();
    }

    if (k == NONE)
-        return gpu::GpuMat();
+        return cuda::GpuMat();

-    CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for gpu::GpuMat and gpu::CudaMem");
-    return gpu::GpuMat();
+    CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::CudaMem");
+    return cuda::GpuMat();
 }

 ogl::Buffer _InputArray::getOGlBuffer() const
@ -1230,7 +1230,7 @@ Size _InputArray::size(int i) const
    if( k == GPU_MAT )
    {
        CV_Assert( i < 0 );
-        const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
+        const cuda::GpuMat* d_mat = (const cuda::GpuMat*)obj;
        return d_mat->size();
    }

@ -1243,7 +1243,7 @@ Size _InputArray::size(int i) const
    //if( k == CUDA_MEM )
    {
        CV_Assert( i < 0 );
-        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+        const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
        return cuda_mem->size();
    }
 }
@ -1299,11 +1299,11 @@ int _InputArray::type(int i) const
        return ((const ogl::Buffer*)obj)->type();

    if( k == GPU_MAT )
-        return ((const gpu::GpuMat*)obj)->type();
+        return ((const cuda::GpuMat*)obj)->type();

    CV_Assert( k == CUDA_MEM );
    //if( k == CUDA_MEM )
-        return ((const gpu::CudaMem*)obj)->type();
+        return ((const cuda::CudaMem*)obj)->type();
 }

 int _InputArray::depth(int i) const
@ -1359,26 +1359,26 @@ bool _InputArray::empty() const
    }

    if( k == GPU_MAT )
-        return ((const gpu::GpuMat*)obj)->empty();
+        return ((const cuda::GpuMat*)obj)->empty();

    CV_Assert( k == CUDA_MEM );
    //if( k == CUDA_MEM )
-        return ((const gpu::CudaMem*)obj)->empty();
+        return ((const cuda::CudaMem*)obj)->empty();
 }


 _OutputArray::_OutputArray() {}
 _OutputArray::_OutputArray(Mat& m) : _InputArray(m) {}
 _OutputArray::_OutputArray(std::vector<Mat>& vec) : _InputArray(vec) {}
-_OutputArray::_OutputArray(gpu::GpuMat& d_mat) : _InputArray(d_mat) {}
+_OutputArray::_OutputArray(cuda::GpuMat& d_mat) : _InputArray(d_mat) {}
 _OutputArray::_OutputArray(ogl::Buffer& buf) : _InputArray(buf) {}
-_OutputArray::_OutputArray(gpu::CudaMem& cuda_mem) : _InputArray(cuda_mem) {}
+_OutputArray::_OutputArray(cuda::CudaMem& cuda_mem) : _InputArray(cuda_mem) {}

 _OutputArray::_OutputArray(const Mat& m) : _InputArray(m) {flags |= FIXED_SIZE|FIXED_TYPE;}
 _OutputArray::_OutputArray(const std::vector<Mat>& vec) : _InputArray(vec) {flags |= FIXED_SIZE;}
-_OutputArray::_OutputArray(const gpu::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;}
+_OutputArray::_OutputArray(const cuda::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;}
 _OutputArray::_OutputArray(const ogl::Buffer& buf) : _InputArray(buf) {flags |= FIXED_SIZE|FIXED_TYPE;}
-_OutputArray::_OutputArray(const gpu::CudaMem& cuda_mem) : _InputArray(cuda_mem) {flags |= FIXED_SIZE|FIXED_TYPE;}
+_OutputArray::_OutputArray(const cuda::CudaMem& cuda_mem) : _InputArray(cuda_mem) {flags |= FIXED_SIZE|FIXED_TYPE;}

 _OutputArray::~_OutputArray() {}

@ -1404,9 +1404,9 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
    }
    if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((gpu::GpuMat*)obj)->size() == _sz);
-        CV_Assert(!fixedType() || ((gpu::GpuMat*)obj)->type() == mtype);
-        ((gpu::GpuMat*)obj)->create(_sz, mtype);
+        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == _sz);
+        CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
+        ((cuda::GpuMat*)obj)->create(_sz, mtype);
        return;
    }
    if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
@ -1418,9 +1418,9 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
    }
    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((gpu::CudaMem*)obj)->size() == _sz);
-        CV_Assert(!fixedType() || ((gpu::CudaMem*)obj)->type() == mtype);
-        ((gpu::CudaMem*)obj)->create(_sz, mtype);
+        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == _sz);
+        CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
+        ((cuda::CudaMem*)obj)->create(_sz, mtype);
        return;
    }
    int sizes[] = {_sz.height, _sz.width};
@ -1439,9 +1439,9 @@ void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransp
    }
    if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((gpu::GpuMat*)obj)->size() == Size(cols, rows));
-        CV_Assert(!fixedType() || ((gpu::GpuMat*)obj)->type() == mtype);
-        ((gpu::GpuMat*)obj)->create(rows, cols, mtype);
+        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
+        ((cuda::GpuMat*)obj)->create(rows, cols, mtype);
        return;
    }
    if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
@ -1453,9 +1453,9 @@ void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransp
    }
    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((gpu::CudaMem*)obj)->size() == Size(cols, rows));
-        CV_Assert(!fixedType() || ((gpu::CudaMem*)obj)->type() == mtype);
-        ((gpu::CudaMem*)obj)->create(rows, cols, mtype);
+        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
+        ((cuda::CudaMem*)obj)->create(rows, cols, mtype);
        return;
    }
    int sizes[] = {rows, cols};
@ -1678,13 +1678,13 @@ void _OutputArray::release() const

    if( k == GPU_MAT )
    {
-        ((gpu::GpuMat*)obj)->release();
+        ((cuda::GpuMat*)obj)->release();
        return;
    }

    if( k == CUDA_MEM )
    {
-        ((gpu::CudaMem*)obj)->release();
+        ((cuda::CudaMem*)obj)->release();
        return;
    }

@ -1757,11 +1757,11 @@ Mat& _OutputArray::getMatRef(int i) const
    }
 }

-gpu::GpuMat& _OutputArray::getGpuMatRef() const
+cuda::GpuMat& _OutputArray::getGpuMatRef() const
 {
    int k = kind();
    CV_Assert( k == GPU_MAT );
-    return *(gpu::GpuMat*)obj;
+    return *(cuda::GpuMat*)obj;
 }

 ogl::Buffer& _OutputArray::getOGlBufferRef() const
@ -1771,11 +1771,11 @@ ogl::Buffer& _OutputArray::getOGlBufferRef() const
    return *(ogl::Buffer*)obj;
 }

-gpu::CudaMem& _OutputArray::getCudaMemRef() const
+cuda::CudaMem& _OutputArray::getCudaMemRef() const
 {
    int k = kind();
    CV_Assert( k == CUDA_MEM );
-    return *(gpu::CudaMem*)obj;
+    return *(cuda::CudaMem*)obj;
 }

 static _OutputArray _none;
--- a/modules/core/src/opengl.cpp
+++ b/modules/core/src/opengl.cpp
@ -50,7 +50,7 @@
 #endif

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

 namespace
 {
@ -122,7 +122,7 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // setGlDevice

-void cv::gpu::setGlDevice(int device)
+void cv::cuda::setGlDevice(int device)
 {
 #ifndef HAVE_OPENGL
    (void) device;
@ -627,7 +627,7 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)
 #endif
 }

-void cv::ogl::Buffer::copyFrom(InputArray arr, gpu::Stream& stream, Target target, bool autoRelease)
+void cv::ogl::Buffer::copyFrom(InputArray arr, cuda::Stream& stream, Target target, bool autoRelease)
 {
 #ifndef HAVE_OPENGL
    (void) arr;
@ -647,7 +647,7 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, gpu::Stream& stream, Target targe

        create(dmat.size(), dmat.type(), target, autoRelease);

-        impl_->copyFrom(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows, gpu::StreamAccessor::getStream(stream));
+        impl_->copyFrom(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows, cuda::StreamAccessor::getStream(stream));
    #endif
 #endif
 }
@ -692,7 +692,7 @@ void cv::ogl::Buffer::copyTo(OutputArray arr) const
 #endif
 }

-void cv::ogl::Buffer::copyTo(OutputArray arr, gpu::Stream& stream) const
+void cv::ogl::Buffer::copyTo(OutputArray arr, cuda::Stream& stream) const
 {
 #ifndef HAVE_OPENGL
    (void) arr;
@ -706,7 +706,7 @@ void cv::ogl::Buffer::copyTo(OutputArray arr, gpu::Stream& stream) const
    #else
        arr.create(rows_, cols_, type_);
        GpuMat dmat = arr.getGpuMat();
-        impl_->copyTo(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows, gpu::StreamAccessor::getStream(stream));
+        impl_->copyTo(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows, cuda::StreamAccessor::getStream(stream));
    #endif
 #endif
 }
@ -794,7 +794,7 @@ void cv::ogl::Buffer::unmapDevice()
 #endif
 }

-gpu::GpuMat cv::ogl::Buffer::mapDevice(gpu::Stream& stream)
+cuda::GpuMat cv::ogl::Buffer::mapDevice(cuda::Stream& stream)
 {
 #ifndef HAVE_OPENGL
    (void) stream;
@ -806,12 +806,12 @@ gpu::GpuMat cv::ogl::Buffer::mapDevice(gpu::Stream& stream)
        throw_no_cuda();
        return GpuMat();
    #else
-        return GpuMat(rows_, cols_, type_, impl_->mapDevice(gpu::StreamAccessor::getStream(stream)));
+        return GpuMat(rows_, cols_, type_, impl_->mapDevice(cuda::StreamAccessor::getStream(stream)));
    #endif
 #endif
 }

-void cv::ogl::Buffer::unmapDevice(gpu::Stream& stream)
+void cv::ogl::Buffer::unmapDevice(cuda::Stream& stream)
 {
 #ifndef HAVE_OPENGL
    (void) stream;
@ -821,7 +821,7 @@ void cv::ogl::Buffer::unmapDevice(gpu::Stream& stream)
        (void) stream;
        throw_no_cuda();
    #else
-        impl_->unmapDevice(gpu::StreamAccessor::getStream(stream));
+        impl_->unmapDevice(cuda::StreamAccessor::getStream(stream));
    #endif
 #endif
 }
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@ -45,11 +45,11 @@

 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/core_c.h"
-#include "opencv2/core/gpu.hpp"
+#include "opencv2/core/cuda.hpp"
 #include "opencv2/core/opengl.hpp"

 #include "opencv2/core/private.hpp"
-#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/core/private.cuda.hpp"

 #include <assert.h>
 #include <ctype.h>
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -634,7 +634,7 @@ CV_IMPL const char* cvErrorStr( int status )
    case CV_StsNotImplemented :      return "The function/feature is not implemented";
    case CV_StsBadMemBlock :         return "Memory block has been corrupted";
    case CV_StsAssert :              return "Assertion failed";
-    case CV_GpuNotSupported :        return "No GPU support";
+    case CV_GpuNotSupported :        return "No CUDA support";
    case CV_GpuApiCallError :        return "Gpu API call";
    case CV_OpenGlNotSupported :     return "No OpenGL support";
    case CV_OpenGlApiCallError :     return "OpenGL API call";
--- a/modules/cuda/CMakeLists.txt
+++ b/modules/cuda/CMakeLists.txt
@ -0,0 +1,9 @@
+if(ANDROID OR IOS)
+  ocv_module_disable(cuda)
+endif()
+
+set(the_description "CUDA-accelerated Computer Vision")
+
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
+
+ocv_define_module(cuda opencv_calib3d opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
--- a/modules/cuda/doc/calib3d.rst
+++ b/modules/cuda/doc/calib3d.rst
@ -5,11 +5,11 @@ Camera Calibration and 3D Reconstruction



-gpu::solvePnPRansac
-------------------
+cuda::solvePnPRansac
+--------------------
 Finds the object pose from 3D-2D point correspondences.

-.. ocv:function:: void gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, int num_iters=100, float max_dist=8.0, int min_inlier_count=100, vector<int>* inliers=NULL)
+.. ocv:function:: void cuda::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, int num_iters=100, float max_dist=8.0, int min_inlier_count=100, vector<int>* inliers=NULL)

    :param object: Single-row matrix of object points.

--- a/modules/cuda/doc/cuda.rst
+++ b/modules/cuda/doc/cuda.rst
@ -1,6 +1,6 @@
-************************************
-gpu. GPU-accelerated Computer Vision
-************************************
+**************************************
+cuda. CUDA-accelerated Computer Vision
+**************************************

 .. toctree::
    :maxdepth: 1
--- a/modules/cuda/doc/data_structures.rst
+++ b/modules/cuda/doc/data_structures.rst
@ -5,9 +5,9 @@ Data Structures



-gpu::PtrStepSz
--------------
-.. ocv:class:: gpu::PtrStepSz
+cuda::PtrStepSz
+---------------
+.. ocv:class:: cuda::PtrStepSz

 Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA kernels). Typically, it is used internally by OpenCV and by users who write device code. You can call its members from both host and device code. ::

@ -30,11 +30,11 @@ Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compi



-gpu::PtrStep
------------
-.. ocv:class:: gpu::PtrStep
+cuda::PtrStep
+-------------
+.. ocv:class:: cuda::PtrStep

-Structure similar to :ocv:class:`gpu::PtrStepSz` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::
+Structure similar to :ocv:class:`cuda::PtrStepSz` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::

    template <typename T> struct PtrStep : public DevPtr<T>
    {
@ -57,9 +57,9 @@ Structure similar to :ocv:class:`gpu::PtrStepSz` but containing only a pointer a



-gpu::GpuMat
-----------
-.. ocv:class:: gpu::GpuMat
+cuda::GpuMat
+------------
+.. ocv:class:: cuda::GpuMat

 Base storage class for GPU memory with reference counting. Its interface matches the :ocv:class:`Mat` interface with the following limitations:

@ -67,7 +67,7 @@ Base storage class for GPU memory with reference counting. Its interface matches
 * no functions that return references to their data (because references on GPU are not valid for CPU)
 * no expression templates technique support

-Beware that the latter limitation may lead to overloaded matrix operators that cause memory allocations. The ``GpuMat`` class is convertible to :ocv:class:`gpu::PtrStepSz` and :ocv:class:`gpu::PtrStep` so it can be passed directly to the kernel.
+Beware that the latter limitation may lead to overloaded matrix operators that cause memory allocations. The ``GpuMat`` class is convertible to :ocv:class:`cuda::PtrStepSz` and :ocv:class:`cuda::PtrStep` so it can be passed directly to the kernel.

 .. note:: In contrast with :ocv:class:`Mat`, in most cases ``GpuMat::isContinuous() == false`` . This means that rows are aligned to a size depending on the hardware. Single-row ``GpuMat`` is always a continuous matrix.

@ -76,34 +76,34 @@ Beware that the latter limitation may lead to overloaded matrix operators that c
    class CV_EXPORTS GpuMat
    {
    public:
-            //! default constructor
-            GpuMat();
+        //! default constructor
+        GpuMat();

-            //! constructs GpuMat of the specified size and type
-            GpuMat(int rows, int cols, int type);
-            GpuMat(Size size, int type);
+        //! constructs GpuMat of the specified size and type
+        GpuMat(int rows, int cols, int type);
+        GpuMat(Size size, int type);

-            .....
+        .....

-            //! builds GpuMat from host memory (Blocking call)
-            explicit GpuMat(InputArray arr);
+        //! builds GpuMat from host memory (Blocking call)
+        explicit GpuMat(InputArray arr);

-            //! returns lightweight PtrStepSz structure for passing
-            //to nvcc-compiled code. Contains size, data ptr and step.
-            template <class T> operator PtrStepSz<T>() const;
-            template <class T> operator PtrStep<T>() const;
+        //! returns lightweight PtrStepSz structure for passing
+        //to nvcc-compiled code. Contains size, data ptr and step.
+        template <class T> operator PtrStepSz<T>() const;
+        template <class T> operator PtrStep<T>() const;

-            //! pefroms upload data to GpuMat (Blocking call)
-            void upload(InputArray arr);
+        //! performs data upload to GpuMat (Blocking call)
+        void upload(InputArray arr);

-            //! pefroms upload data to GpuMat (Non-Blocking call)
-            void upload(InputArray arr, Stream& stream);
+        //! performs data upload to GpuMat (Non-Blocking call)
+        void upload(InputArray arr, Stream& stream);

-            //! pefroms download data from device to host memory (Blocking call)
-            void download(OutputArray dst) const;
+        //! performs data download from device to host memory (Blocking call)
+        void download(OutputArray dst) const;

-            //! pefroms download data from device to host memory (Non-Blocking call)
-            void download(OutputArray dst, Stream& stream) const;
+        //! performs data download from device to host memory (Non-Blocking call)
+        void download(OutputArray dst, Stream& stream) const;
    };


@ -113,11 +113,11 @@ Beware that the latter limitation may lead to overloaded matrix operators that c



-gpu::createContinuous
---------------------
+cuda::createContinuous
+----------------------
 Creates a continuous matrix.

-.. ocv:function:: void gpu::createContinuous(int rows, int cols, int type, OutputArray arr)
+.. ocv:function:: void cuda::createContinuous(int rows, int cols, int type, OutputArray arr)

    :param rows: Row count.

@ -131,11 +131,11 @@ Matrix is called continuous if its elements are stored continuously, that is, wi



-gpu::ensureSizeIsEnough
-----------------------
+cuda::ensureSizeIsEnough
+------------------------
 Ensures that the size of a matrix is big enough and the matrix has a proper type.

-.. ocv:function:: void gpu::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
+.. ocv:function:: void cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)

    :param rows: Minimum desired number of rows.

@ -149,9 +149,9 @@ The function does not reallocate memory if the matrix has proper attributes alre



-gpu::CudaMem
------------
-.. ocv:class:: gpu::CudaMem
+cuda::CudaMem
+-------------
+.. ocv:class:: cuda::CudaMem

 Class with reference counting wrapping special memory type allocation functions from CUDA. Its interface is also :ocv:class:`Mat`-like but with additional memory type parameters.

@ -191,47 +191,47 @@ Class with reference counting wrapping special memory type allocation functions



-gpu::CudaMem::createMatHeader
-----------------------------
-Creates a header without reference counting to :ocv:class:`gpu::CudaMem` data.
+cuda::CudaMem::createMatHeader
+------------------------------
+Creates a header without reference counting to :ocv:class:`cuda::CudaMem` data.

-.. ocv:function:: Mat gpu::CudaMem::createMatHeader() const
+.. ocv:function:: Mat cuda::CudaMem::createMatHeader() const



-gpu::CudaMem::createGpuMatHeader
--------------------------------
-Maps CPU memory to GPU address space and creates the :ocv:class:`gpu::GpuMat` header without reference counting for it.
+cuda::CudaMem::createGpuMatHeader
+---------------------------------
+Maps CPU memory to GPU address space and creates the :ocv:class:`cuda::GpuMat` header without reference counting for it.

-.. ocv:function:: GpuMat gpu::CudaMem::createGpuMatHeader() const
+.. ocv:function:: GpuMat cuda::CudaMem::createGpuMatHeader() const

 This can be done only if memory was allocated with the ``SHARED`` flag and if it is supported by the hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which eliminates an extra copy.



-gpu::registerPageLocked
-----------------------
+cuda::registerPageLocked
+------------------------
 Page-locks the memory of matrix and maps it for the device(s).

-.. ocv:function:: void gpu::registerPageLocked(Mat& m)
+.. ocv:function:: void cuda::registerPageLocked(Mat& m)

    :param m: Input matrix.



-gpu::unregisterPageLocked
-------------------------
+cuda::unregisterPageLocked
+--------------------------
 Unmaps the memory of matrix and makes it pageable again.

-.. ocv:function:: void gpu::unregisterPageLocked(Mat& m)
+.. ocv:function:: void cuda::unregisterPageLocked(Mat& m)

    :param m: Input matrix.



-gpu::Stream
-----------
-.. ocv:class:: gpu::Stream
+cuda::Stream
+------------
+.. ocv:class:: cuda::Stream

 This class encapsulates a queue of asynchronous calls.

@ -265,45 +265,45 @@ This class encapsulates a queue of asynchronous calls.



-gpu::Stream::queryIfComplete
----------------------------
+cuda::Stream::queryIfComplete
+-----------------------------
 Returns ``true`` if the current stream queue is finished. Otherwise, it returns false.

-.. ocv:function:: bool gpu::Stream::queryIfComplete()
+.. ocv:function:: bool cuda::Stream::queryIfComplete()



-gpu::Stream::waitForCompletion
------------------------------
+cuda::Stream::waitForCompletion
+-------------------------------
 Blocks the current CPU thread until all operations in the stream are complete.

-.. ocv:function:: void gpu::Stream::waitForCompletion()
+.. ocv:function:: void cuda::Stream::waitForCompletion()



-gpu::Stream::waitEvent
----------------------
+cuda::Stream::waitEvent
+-----------------------
 Makes a compute stream wait on an event.

-.. ocv:function:: void gpu::Stream::waitEvent(const Event& event)
+.. ocv:function:: void cuda::Stream::waitEvent(const Event& event)



-gpu::Stream::enqueueHostCallback
--------------------------------
+cuda::Stream::enqueueHostCallback
+---------------------------------
 Adds a callback to be called on the host after all currently enqueued items in the stream have completed.

-.. ocv:function:: void gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
+.. ocv:function:: void cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)

 .. note:: Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization that may depend on outstanding device work or other callbacks that are not mandated to run earlier.  Callbacks without a mandated order (in independent streams) execute in undefined order and may be serialized.



-gpu::StreamAccessor
-------------------
-.. ocv:struct:: gpu::StreamAccessor
+cuda::StreamAccessor
+--------------------
+.. ocv:struct:: cuda::StreamAccessor

-Class that enables getting ``cudaStream_t`` from :ocv:class:`gpu::Stream` and is declared in ``stream_accessor.hpp`` because it is the only public header that depends on the CUDA Runtime API. Including it brings a dependency to your code. ::
+Class that enables getting ``cudaStream_t`` from :ocv:class:`cuda::Stream` and is declared in ``stream_accessor.hpp`` because it is the only public header that depends on the CUDA Runtime API. Including it brings a dependency to your code. ::

    struct StreamAccessor
    {
--- a/modules/cuda/doc/initalization_and_information.rst
+++ b/modules/cuda/doc/initalization_and_information.rst
@ -5,51 +5,51 @@ Initalization and Information



-gpu::getCudaEnabledDeviceCount
------------------------------
+cuda::getCudaEnabledDeviceCount
+-------------------------------
 Returns the number of installed CUDA-enabled devices.

-.. ocv:function:: int gpu::getCudaEnabledDeviceCount()
+.. ocv:function:: int cuda::getCudaEnabledDeviceCount()

-Use this function before any other GPU functions calls. If OpenCV is compiled without GPU support, this function returns 0.
+Use this function before any other CUDA function calls. If OpenCV is compiled without CUDA support, this function returns 0.



-gpu::setDevice
--------------
+cuda::setDevice
+---------------
 Sets a device and initializes it for the current thread.

-.. ocv:function:: void gpu::setDevice(int device)
+.. ocv:function:: void cuda::setDevice(int device)

-    :param device: System index of a GPU device starting with 0.
+    :param device: System index of a CUDA device starting with 0.

-If the call of this function is omitted, a default device is initialized at the fist GPU usage.
+If the call to this function is omitted, a default device is initialized at the first CUDA usage.



-gpu::getDevice
--------------
-Returns the current device index set by :ocv:func:`gpu::setDevice` or initialized by default.
+cuda::getDevice
+---------------
+Returns the current device index set by :ocv:func:`cuda::setDevice` or initialized by default.

-.. ocv:function:: int gpu::getDevice()
+.. ocv:function:: int cuda::getDevice()



-gpu::resetDevice
----------------
+cuda::resetDevice
+-----------------
 Explicitly destroys and cleans up all resources associated with the current device in the current process.

-.. ocv:function:: void gpu::resetDevice()
+.. ocv:function:: void cuda::resetDevice()

 Any subsequent API call to this device will reinitialize the device.



-gpu::FeatureSet
---------------
-Enumeration providing GPU computing features.
+cuda::FeatureSet
+----------------
+Enumeration providing CUDA computing features.

-.. ocv:enum:: gpu::FeatureSet
+.. ocv:enum:: cuda::FeatureSet

  .. ocv:emember:: FEATURE_SET_COMPUTE_10
  .. ocv:emember:: FEATURE_SET_COMPUTE_11
@ -62,33 +62,34 @@ Enumeration providing GPU computing features.
  .. ocv:emember:: NATIVE_DOUBLE


-gpu::TargetArchs
----------------
-.. ocv:class:: gpu::TargetArchs

-Class providing a set of static methods to check what NVIDIA* card architecture the GPU module was built for.
+cuda::TargetArchs
+-----------------
+.. ocv:class:: cuda::TargetArchs
+
+Class providing a set of static methods to check what NVIDIA* card architecture the CUDA module was built for.

 The following method checks whether the module was built with the support of the given feature:

-    .. ocv:function:: static bool gpu::TargetArchs::builtWith( FeatureSet feature_set )
+    .. ocv:function:: static bool cuda::TargetArchs::builtWith( FeatureSet feature_set )

-        :param feature_set: Features to be checked. See :ocv:enum:`gpu::FeatureSet`.
+        :param feature_set: Features to be checked. See :ocv:enum:`cuda::FeatureSet`.

-There is a set of methods to check whether the module contains intermediate (PTX) or binary GPU code for the given architecture(s):
+There is a set of methods to check whether the module contains intermediate (PTX) or binary CUDA code for the given architecture(s):

-    .. ocv:function:: static bool gpu::TargetArchs::has(int major, int minor)
+    .. ocv:function:: static bool cuda::TargetArchs::has(int major, int minor)

-    .. ocv:function:: static bool gpu::TargetArchs::hasPtx(int major, int minor)
+    .. ocv:function:: static bool cuda::TargetArchs::hasPtx(int major, int minor)

-    .. ocv:function:: static bool gpu::TargetArchs::hasBin(int major, int minor)
+    .. ocv:function:: static bool cuda::TargetArchs::hasBin(int major, int minor)

-    .. ocv:function:: static bool gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
+    .. ocv:function:: static bool cuda::TargetArchs::hasEqualOrLessPtx(int major, int minor)

-    .. ocv:function:: static bool gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
+    .. ocv:function:: static bool cuda::TargetArchs::hasEqualOrGreater(int major, int minor)

-    .. ocv:function:: static bool gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
+    .. ocv:function:: static bool cuda::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)

-    .. ocv:function:: static bool gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
+    .. ocv:function:: static bool cuda::TargetArchs::hasEqualOrGreaterBin(int major, int minor)

        :param major: Major compute capability version.

@ -98,9 +99,9 @@ According to the CUDA C Programming Guide Version 3.2: "PTX code produced for so



-gpu::DeviceInfo
---------------
-.. ocv:class:: gpu::DeviceInfo
+cuda::DeviceInfo
+----------------
+.. ocv:class:: cuda::DeviceInfo

 Class providing functionality for querying the specified GPU properties. ::

@ -285,90 +286,90 @@ Class providing functionality for querying the specified GPU properties. ::
        //! checks whether device supports the given feature
        bool supports(FeatureSet feature_set) const;

-        //! checks whether the GPU module can be run on the given device
+        //! checks whether the CUDA module can be run on the given device
        bool isCompatible() const;
    };



-gpu::DeviceInfo::DeviceInfo
---------------------------
+cuda::DeviceInfo::DeviceInfo
+----------------------------
 The constructors.

-.. ocv:function:: gpu::DeviceInfo::DeviceInfo()
+.. ocv:function:: cuda::DeviceInfo::DeviceInfo()

-.. ocv:function:: gpu::DeviceInfo::DeviceInfo(int device_id)
+.. ocv:function:: cuda::DeviceInfo::DeviceInfo(int device_id)

-    :param device_id: System index of the GPU device starting with 0.
+    :param device_id: System index of the CUDA device starting with 0.

 Constructs the ``DeviceInfo`` object for the specified device. If the ``device_id`` parameter is omitted, it constructs an object for the current device.



-gpu::DeviceInfo::name
---------------------
+cuda::DeviceInfo::name
+----------------------
 Returns the device name.

-.. ocv:function:: const char* gpu::DeviceInfo::name() const
+.. ocv:function:: const char* cuda::DeviceInfo::name() const



-gpu::DeviceInfo::majorVersion
-----------------------------
+cuda::DeviceInfo::majorVersion
+------------------------------
 Returns the major compute capability version.

-.. ocv:function:: int gpu::DeviceInfo::majorVersion()
+.. ocv:function:: int cuda::DeviceInfo::majorVersion()



-gpu::DeviceInfo::minorVersion
-----------------------------
+cuda::DeviceInfo::minorVersion
+------------------------------
 Returns the minor compute capability version.

-.. ocv:function:: int gpu::DeviceInfo::minorVersion()
+.. ocv:function:: int cuda::DeviceInfo::minorVersion()



-gpu::DeviceInfo::freeMemory
---------------------------
+cuda::DeviceInfo::freeMemory
+----------------------------
 Returns the amount of free memory in bytes.

-.. ocv:function:: size_t gpu::DeviceInfo::freeMemory()
+.. ocv:function:: size_t cuda::DeviceInfo::freeMemory()



-gpu::DeviceInfo::totalMemory
----------------------------
+cuda::DeviceInfo::totalMemory
+-----------------------------
 Returns the amount of total memory in bytes.

-.. ocv:function:: size_t gpu::DeviceInfo::totalMemory()
+.. ocv:function:: size_t cuda::DeviceInfo::totalMemory()



-gpu::DeviceInfo::supports
-------------------------
-Provides information on GPU feature support.
+cuda::DeviceInfo::supports
+--------------------------
+Provides information on CUDA feature support.

-.. ocv:function:: bool gpu::DeviceInfo::supports(FeatureSet feature_set) const
+.. ocv:function:: bool cuda::DeviceInfo::supports(FeatureSet feature_set) const

-    :param feature_set: Features to be checked. See :ocv:enum:`gpu::FeatureSet`.
+    :param feature_set: Features to be checked. See :ocv:enum:`cuda::FeatureSet`.

-This function returns ``true`` if the device has the specified GPU feature. Otherwise, it returns ``false`` .
+This function returns ``true`` if the device has the specified CUDA feature. Otherwise, it returns ``false`` .



-gpu::DeviceInfo::isCompatible
-----------------------------
-Checks the GPU module and device compatibility.
+cuda::DeviceInfo::isCompatible
+------------------------------
+Checks the CUDA module and device compatibility.

-.. ocv:function:: bool gpu::DeviceInfo::isCompatible()
+.. ocv:function:: bool cuda::DeviceInfo::isCompatible()

-This function returns ``true`` if the GPU module can be run on the specified device. Otherwise, it returns ``false`` .
+This function returns ``true`` if the CUDA module can be run on the specified device. Otherwise, it returns ``false`` .



-gpu::DeviceInfo::deviceID
-------------------------
-Returns system index of the GPU device starting with 0.
+cuda::DeviceInfo::deviceID
+--------------------------
+Returns system index of the CUDA device starting with 0.

-.. ocv:function:: int gpu::DeviceInfo::deviceID()
+.. ocv:function:: int cuda::DeviceInfo::deviceID()
--- a/modules/cuda/doc/introduction.rst
+++ b/modules/cuda/doc/introduction.rst
@ -0,0 +1,61 @@
+CUDA Module Introduction
+========================
+
+.. highlight:: cpp
+
+
+
+General Information
+-------------------
+
+The OpenCV CUDA module is a set of classes and functions to utilize CUDA computational capabilities. It is implemented using NVIDIA* CUDA* Runtime API and supports only NVIDIA GPUs. The OpenCV CUDA module includes utility functions, low-level vision primitives, and high-level algorithms. The utility functions and low-level primitives provide a powerful infrastructure for developing fast vision algorithms taking advantage of CUDA whereas the high-level functionality includes some state-of-the-art algorithms (such as stereo correspondence, face and people detectors, and others) ready to be used by the application developers.
+
+The CUDA module is designed as a host-level API. This means that if you have pre-compiled OpenCV CUDA binaries, you are not required to have the CUDA Toolkit installed or write any extra code to make use of CUDA.
+
+The OpenCV CUDA module is designed for ease of use and does not require any knowledge of CUDA. Such knowledge, though, will certainly be useful for handling non-trivial cases or achieving the highest performance. It is helpful to understand the cost of various operations, what the GPU does, what the preferred data formats are, and so on. The CUDA module is an effective instrument for quick implementation of CUDA-accelerated computer vision algorithms. However, if your algorithm involves many simple operations, then, for the best possible performance, you may still need to write your own kernels to avoid extra write and read operations on the intermediate results.
+
+To enable CUDA support, configure OpenCV using ``CMake`` with ``WITH_CUDA=ON`` . When the flag is set and if CUDA is installed, the full-featured OpenCV CUDA module is built. Otherwise, the module is still built but at runtime all functions from the module throw
+:ocv:class:`Exception` with ``CV_GpuNotSupported`` error code, except for
+:ocv:func:`cuda::getCudaEnabledDeviceCount()`. The latter function returns zero GPU count in this case. Building OpenCV without CUDA support does not perform device code compilation, so it does not require the CUDA Toolkit installed. Therefore, using the
+:ocv:func:`cuda::getCudaEnabledDeviceCount()` function, you can implement a high-level algorithm that will detect GPU presence at runtime and choose an appropriate implementation (CPU or GPU) accordingly.
+
+Compilation for Different NVIDIA* Platforms
+-------------------------------------------
+
+NVIDIA* compiler enables generating binary code (cubin and fatbin) and intermediate code (PTX). Binary code often implies a specific GPU architecture and generation, so the compatibility with other GPUs is not guaranteed. PTX is targeted for a virtual platform that is defined entirely by the set of capabilities or features. Depending on the selected virtual platform, some of the instructions are emulated or disabled, even if the real hardware supports all the features.
+
+At the first call, the PTX code is compiled to binary code for the particular GPU using a JIT compiler. When the target GPU has a compute capability (CC) lower than the PTX code, JIT fails.
+By default, the OpenCV CUDA module includes:
+
+*
+    Binaries for compute capabilities 1.3 and 2.0 (controlled by ``CUDA_ARCH_BIN`` in ``CMake``)
+
+*
+    PTX code for compute capabilities 1.1 and 1.3 (controlled by ``CUDA_ARCH_PTX`` in ``CMake``)
+
+This means that for devices with CC 1.3 and 2.0 binary images are ready to run. For all newer platforms, the PTX code for 1.3 is JIT'ed to a binary image. For devices with CC 1.1 and 1.2, the PTX for 1.1 is JIT'ed. For devices with CC 1.0, no code is available and the functions throw
+:ocv:class:`Exception`. On platforms where JIT compilation has to be performed, the first run is slow.
+
+On a GPU with CC 1.0, you can still compile the CUDA module and most of the functions will run flawlessly. To achieve this, add "1.0" to the list of binaries, for example, ``CUDA_ARCH_BIN="1.0 1.3 2.0"`` . The functions that cannot be run on CC 1.0 GPUs throw an exception.
+
+You can always determine at runtime whether the OpenCV GPU-built binaries (or PTX code) are compatible with your GPU. The function
+:ocv:func:`cuda::DeviceInfo::isCompatible` returns the compatibility status (true/false).
+
+Utilizing Multiple GPUs
+-----------------------
+
+In the current version, each of the OpenCV CUDA algorithms can use only a single GPU. So, to utilize multiple GPUs, you have to manually distribute the work between GPUs.
+Switching the active device can be done using the :ocv:func:`cuda::setDevice()` function. For more details, please read the CUDA C Programming Guide.
+
+While developing algorithms for multiple GPUs, note a data passing overhead. For primitive functions and small images, it can be significant, which may eliminate all the advantages of having multiple GPUs. But for high-level algorithms, consider using multi-GPU acceleration. For example, the Stereo Block Matching algorithm has been successfully parallelized using the following algorithm:
+
+
+ 1.   Split each image of the stereo pair into two horizontal overlapping stripes.
+
+
+ 2.   Process each pair of stripes (from the left and right images) on a separate Fermi* GPU.
+
+
+ 3.   Merge the results into a single disparity map.
+
+With this algorithm, a dual GPU gave a 180% performance increase compared to a single Fermi GPU. For a source code example, see http://code.opencv.org/projects/opencv/repository/revisions/master/entry/samples/gpu/.
--- a/modules/cuda/doc/object_detection.rst
+++ b/modules/cuda/doc/object_detection.rst
@ -5,9 +5,9 @@ Object Detection



-gpu::HOGDescriptor
------------------
-.. ocv:struct:: gpu::HOGDescriptor
+cuda::HOGDescriptor
+-------------------
+.. ocv:struct:: cuda::HOGDescriptor

 The class implements Histogram of Oriented Gradients ([Dalal2005]_) object detector. ::

@ -65,15 +65,17 @@ Interfaces of all methods are kept similar to the ``CPU HOG`` descriptor and det
 .. note::

   * An example applying the HOG descriptor for people detection can be found at opencv_source_code/samples/cpp/peopledetect.cpp
-   * A GPU example applying the HOG descriptor for people detection can be found at opencv_source_code/samples/gpu/hog.cpp
+   * A CUDA example applying the HOG descriptor for people detection can be found at opencv_source_code/samples/gpu/hog.cpp

   * (Python) An example applying the HOG descriptor for people detection can be found at opencv_source_code/samples/python2/peopledetect.py

-gpu::HOGDescriptor::HOGDescriptor
-------------------------------------
+
+
+cuda::HOGDescriptor::HOGDescriptor
+----------------------------------
 Creates the ``HOG`` descriptor and detector.

-.. ocv:function:: gpu::HOGDescriptor::HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA, double threshold_L2hys=0.2, bool gamma_correction=true, int nlevels=DEFAULT_NLEVELS)
+.. ocv:function:: cuda::HOGDescriptor::HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA, double threshold_L2hys=0.2, bool gamma_correction=true, int nlevels=DEFAULT_NLEVELS)

   :param win_size: Detection window size. Align to block size and block stride.

@ -95,59 +97,59 @@ Creates the ``HOG`` descriptor and detector.



-gpu::HOGDescriptor::getDescriptorSize
-----------------------------------------
+cuda::HOGDescriptor::getDescriptorSize
+--------------------------------------
 Returns the number of coefficients required for the classification.

-.. ocv:function:: size_t gpu::HOGDescriptor::getDescriptorSize() const
+.. ocv:function:: size_t cuda::HOGDescriptor::getDescriptorSize() const



-gpu::HOGDescriptor::getBlockHistogramSize
---------------------------------------------
+cuda::HOGDescriptor::getBlockHistogramSize
+------------------------------------------
 Returns the block histogram size.

-.. ocv:function:: size_t gpu::HOGDescriptor::getBlockHistogramSize() const
+.. ocv:function:: size_t cuda::HOGDescriptor::getBlockHistogramSize() const



-gpu::HOGDescriptor::setSVMDetector
--------------------------------------
+cuda::HOGDescriptor::setSVMDetector
+-----------------------------------
 Sets coefficients for the linear SVM classifier.

-.. ocv:function:: void gpu::HOGDescriptor::setSVMDetector(const vector<float>& detector)
+.. ocv:function:: void cuda::HOGDescriptor::setSVMDetector(const vector<float>& detector)



-gpu::HOGDescriptor::getDefaultPeopleDetector
------------------------------------------------
+cuda::HOGDescriptor::getDefaultPeopleDetector
+---------------------------------------------
 Returns coefficients of the classifier trained for people detection (for default window size).

-.. ocv:function:: static vector<float> gpu::HOGDescriptor::getDefaultPeopleDetector()
+.. ocv:function:: static vector<float> cuda::HOGDescriptor::getDefaultPeopleDetector()



-gpu::HOGDescriptor::getPeopleDetector48x96
----------------------------------------------
+cuda::HOGDescriptor::getPeopleDetector48x96
+-------------------------------------------
 Returns coefficients of the classifier trained for people detection (for 48x96 windows).

-.. ocv:function:: static vector<float> gpu::HOGDescriptor::getPeopleDetector48x96()
+.. ocv:function:: static vector<float> cuda::HOGDescriptor::getPeopleDetector48x96()



-gpu::HOGDescriptor::getPeopleDetector64x128
-----------------------------------------------
+cuda::HOGDescriptor::getPeopleDetector64x128
+--------------------------------------------
 Returns coefficients of the classifier trained for people detection (for 64x128 windows).

-.. ocv:function:: static vector<float> gpu::HOGDescriptor::getPeopleDetector64x128()
+.. ocv:function:: static vector<float> cuda::HOGDescriptor::getPeopleDetector64x128()



-gpu::HOGDescriptor::detect
------------------------------
+cuda::HOGDescriptor::detect
+---------------------------
 Performs object detection without a multi-scale window.

-.. ocv:function:: void gpu::HOGDescriptor::detect(const GpuMat& img, vector<Point>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size())
+.. ocv:function:: void cuda::HOGDescriptor::detect(const GpuMat& img, vector<Point>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size())

   :param img: Source image.  ``CV_8UC1``  and  ``CV_8UC4`` types are supported for now.

@ -161,17 +163,17 @@ Performs object detection without a multi-scale window.



-gpu::HOGDescriptor::detectMultiScale
----------------------------------------
+cuda::HOGDescriptor::detectMultiScale
+-------------------------------------
 Performs object detection with a multi-scale window.

-.. ocv:function:: void gpu::HOGDescriptor::detectMultiScale(const GpuMat& img, vector<Rect>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size(), double scale0=1.05, int group_threshold=2)
+.. ocv:function:: void cuda::HOGDescriptor::detectMultiScale(const GpuMat& img, vector<Rect>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size(), double scale0=1.05, int group_threshold=2)

-   :param img: Source image. See  :ocv:func:`gpu::HOGDescriptor::detect`  for type limitations.
+   :param img: Source image. See  :ocv:func:`cuda::HOGDescriptor::detect`  for type limitations.

   :param found_locations: Detected objects boundaries.

-   :param hit_threshold: Threshold for the distance between features and SVM classifying plane. See  :ocv:func:`gpu::HOGDescriptor::detect`  for details.
+   :param hit_threshold: Threshold for the distance between features and SVM classifying plane. See  :ocv:func:`cuda::HOGDescriptor::detect`  for details.

   :param win_stride: Window stride. It must be a multiple of block stride.

@ -183,13 +185,13 @@ Performs object detection with a multi-scale window.



-gpu::HOGDescriptor::getDescriptors
--------------------------------------
+cuda::HOGDescriptor::getDescriptors
+-----------------------------------
 Returns block descriptors computed for the whole image.

-.. ocv:function:: void gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format=DESCR_FORMAT_COL_BY_COL)
+.. ocv:function:: void cuda::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format=DESCR_FORMAT_COL_BY_COL)

-   :param img: Source image. See  :ocv:func:`gpu::HOGDescriptor::detect`  for type limitations.
+   :param img: Source image. See  :ocv:func:`cuda::HOGDescriptor::detect`  for type limitations.

   :param win_stride: Window stride. It must be a multiple of block stride.

@ -204,18 +206,19 @@ Returns block descriptors computed for the whole image.
 The function is mainly used to learn the classifier.


-gpu::CascadeClassifier_GPU
--------------------------
-.. ocv:class:: gpu::CascadeClassifier_GPU
+
+cuda::CascadeClassifier_CUDA
+----------------------------
+.. ocv:class:: cuda::CascadeClassifier_CUDA

 Cascade classifier class used for object detection. Supports HAAR and LBP cascades. ::

-    class CV_EXPORTS CascadeClassifier_GPU
+    class CV_EXPORTS CascadeClassifier_CUDA
    {
    public:
-            CascadeClassifier_GPU();
-            CascadeClassifier_GPU(const String& filename);
-            ~CascadeClassifier_GPU();
+            CascadeClassifier_CUDA();
+            CascadeClassifier_CUDA(const String& filename);
+            ~CascadeClassifier_CUDA();

            bool empty() const;
            bool load(const String& filename);
@ -239,48 +242,51 @@ Cascade classifier class used for object detection. Supports HAAR and LBP cascad
   * A cascade classifier example can be found at opencv_source_code/samples/gpu/cascadeclassifier.cpp
   * An NVIDIA API-specific cascade classifier example can be found at opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp

-gpu::CascadeClassifier_GPU::CascadeClassifier_GPU
-----------------------------------------------------
+
+
+cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA
+----------------------------------------------------
 Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.

-.. ocv:function:: gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String& filename)
+.. ocv:function:: cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String& filename)

    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR, and only the new type of OpenCV XML cascade is supported for LBP.



-gpu::CascadeClassifier_GPU::empty
-------------------------------------
+cuda::CascadeClassifier_CUDA::empty
+-----------------------------------
 Checks whether the classifier is loaded or not.

-.. ocv:function:: bool gpu::CascadeClassifier_GPU::empty() const
+.. ocv:function:: bool cuda::CascadeClassifier_CUDA::empty() const



-gpu::CascadeClassifier_GPU::load
------------------------------------
+cuda::CascadeClassifier_CUDA::load
+----------------------------------
 Loads the classifier from a file. The previous content is destroyed.

-.. ocv:function:: bool gpu::CascadeClassifier_GPU::load(const String& filename)
+.. ocv:function:: bool cuda::CascadeClassifier_CUDA::load(const String& filename)

    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR, and only the new type of OpenCV XML cascade is supported for LBP.


-gpu::CascadeClassifier_GPU::release
---------------------------------------
+
+cuda::CascadeClassifier_CUDA::release
+-------------------------------------
 Destroys the loaded classifier.

-.. ocv:function:: void gpu::CascadeClassifier_GPU::release()
+.. ocv:function:: void cuda::CascadeClassifier_CUDA::release()



-gpu::CascadeClassifier_GPU::detectMultiScale
------------------------------------------------
+cuda::CascadeClassifier_CUDA::detectMultiScale
+----------------------------------------------
 Detects objects of different sizes in the input image.

-.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())
+.. ocv:function:: int cuda::CascadeClassifier_CUDA::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())

-.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)
+.. ocv:function:: int cuda::CascadeClassifier_CUDA::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)

    :param image: Matrix of type  ``CV_8U``  containing an image where objects should be detected.

@ -298,7 +304,7 @@ The detected objects are returned as a list of rectangles.

 The function returns the number of detected objects, so you can retrieve them as in the following example: ::

-    gpu::CascadeClassifier_GPU cascade_gpu(...);
+    cuda::CascadeClassifier_CUDA cascade_gpu(...);

    Mat image_cpu = imread(...);
    GpuMat image_gpu(image_cpu);
--- a/modules/cuda/include/opencv2/cuda.hpp
+++ b/modules/cuda/include/opencv2/cuda.hpp
@ -40,56 +40,16 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_HPP__
-#define __OPENCV_GPU_HPP__
+#ifndef __OPENCV_CUDA_HPP__
+#define __OPENCV_CUDA_HPP__

 #ifndef __cplusplus
-#  error gpu.hpp header must be compiled as C++
+#  error cuda.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpu.hpp"
+#include "opencv2/core/cuda.hpp"

-#if !defined(__OPENCV_BUILD) && !defined(OPENCV_GPU_SKIP_INCLUDE)
-    #include "opencv2/opencv_modules.hpp"
-
-    #ifdef HAVE_OPENCV_GPUARITHM
-        #include "opencv2/gpuarithm.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUWARPING
-        #include "opencv2/gpuwarping.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUFILTERS
-        #include "opencv2/gpufilters.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUIMGPROC
-        #include "opencv2/gpuimgproc.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUFEATURES2D
-        #include "opencv2/gpufeatures2d.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUOPTFLOW
-        #include "opencv2/gpuoptflow.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUBGSEGM
-        #include "opencv2/gpubgsegm.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUSTEREO
-        #include "opencv2/gpustereo.hpp"
-    #endif
-
-    #ifdef HAVE_OPENCV_GPUCODEC
-        #include "opencv2/gpucodec.hpp"
-    #endif
-#endif
-
-namespace cv { namespace gpu {
+namespace cv { namespace cuda {

 //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////

@ -186,12 +146,12 @@ protected:
 //////////////////////////// CascadeClassifier ////////////////////////////

 // The cascade classifier class for object detection: supports old haar and new lbp xml formats and nvbin for haar cascades only.
-class CV_EXPORTS CascadeClassifier_GPU
+class CV_EXPORTS CascadeClassifier_CUDA
 {
 public:
-    CascadeClassifier_GPU();
-    CascadeClassifier_GPU(const String& filename);
-    ~CascadeClassifier_GPU();
+    CascadeClassifier_CUDA();
+    CascadeClassifier_CUDA(const String& filename);
+    ~CascadeClassifier_CUDA();

    bool empty() const;
    bool load(const String& filename);
@ -211,7 +171,7 @@ private:
    CascadeClassifierImpl* impl;
    struct HaarCascade;
    struct LbpCascade;
-    friend class CascadeClassifier_GPU_LBP;
+    friend class CascadeClassifier_CUDA_LBP;
 };

 //////////////////////////// Labeling ////////////////////////////
@ -255,6 +215,6 @@ CV_EXPORTS void calcWobbleSuppressionMaps(
        int left, int idx, int right, Size size, const Mat &ml, const Mat &mr,
        GpuMat &mapx, GpuMat &mapy);

-}} // namespace cv { namespace gpu {
+}} // namespace cv { namespace cuda {

-#endif /* __OPENCV_GPU_HPP__ */
+#endif /* __OPENCV_CUDA_HPP__ */
--- a/modules/cuda/perf/perf_calib3d.cpp
+++ b/modules/cuda/perf/perf_calib3d.cpp
@ -63,14 +63,14 @@ PERF_TEST_P(Count, Calib3D_ProjectPoints,
    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
    const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), dst);
+        TEST_CYCLE() cv::cuda::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), dst);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -118,12 +118,12 @@ PERF_TEST_P(Count, Calib3D_SolvePnPRansac,
    cv::Mat rvec;
    cv::Mat tvec;

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        TEST_CYCLE() cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+        TEST_CYCLE() cv::cuda::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);

-        GPU_SANITY_CHECK(rvec, 1e-3);
-        GPU_SANITY_CHECK(tvec, 1e-3);
+        CUDA_SANITY_CHECK(rvec, 1e-3);
+        CUDA_SANITY_CHECK(tvec, 1e-3);
    }
    else
    {
--- a/modules/cuda/perf/perf_labeling.cpp
+++ b/modules/cuda/perf/perf_labeling.cpp
@ -149,14 +149,14 @@ PERF_TEST_P(Image, DISABLED_Labeling_ConnectivityMask,
    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(image.empty());

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::gpu::GpuMat d_image(image);
-        cv::gpu::GpuMat mask;
+        cv::cuda::GpuMat d_image(image);
+        cv::cuda::GpuMat mask;

-        TEST_CYCLE() cv::gpu::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));
+        TEST_CYCLE() cv::cuda::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));

-        GPU_SANITY_CHECK(mask);
+        CUDA_SANITY_CHECK(mask);
    }
    else
    {
@ -172,16 +172,16 @@ PERF_TEST_P(Image, DISABLED_Labeling_ConnectedComponents,
    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(image.empty());

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::gpu::GpuMat d_mask;
-        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));
+        cv::cuda::GpuMat d_mask;
+        cv::cuda::connectivityMask(cv::cuda::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));

-        cv::gpu::GpuMat components;
+        cv::cuda::GpuMat components;

-        TEST_CYCLE() cv::gpu::labelComponents(d_mask, components);
+        TEST_CYCLE() cv::cuda::labelComponents(d_mask, components);

-        GPU_SANITY_CHECK(components);
+        CUDA_SANITY_CHECK(components);
    }
    else
    {
--- a/modules/cuda/perf/perf_main.cpp
+++ b/modules/cuda/perf/perf_main.cpp
@ -44,4 +44,4 @@

 using namespace perf;

-CV_PERF_TEST_CUDA_MAIN(gpu)
+CV_PERF_TEST_CUDA_MAIN(cuda)
--- a/modules/cuda/perf/perf_matop.cpp
+++ b/modules/cuda/perf/perf_matop.cpp
@ -50,9 +50,9 @@ using namespace perf;
 // SetTo

 PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
-                    GPU_CHANNELS_1_3_4))
+                    CUDA_CHANNELS_1_3_4))
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -62,13 +62,13 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,

    const cv::Scalar val(1, 2, 3, 4);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::gpu::GpuMat dst(size, type);
+        cv::cuda::GpuMat dst(size, type);

        TEST_CYCLE() dst.setTo(val);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -84,9 +84,9 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
 // SetToMasked

 PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
-                    GPU_CHANNELS_1_3_4))
+                    CUDA_CHANNELS_1_3_4))
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -100,14 +100,14 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,

    const cv::Scalar val(1, 2, 3, 4);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::gpu::GpuMat dst(src);
-        const cv::gpu::GpuMat d_mask(mask);
+        cv::cuda::GpuMat dst(src);
+        const cv::cuda::GpuMat d_mask(mask);

        TEST_CYCLE() dst.setTo(val, d_mask);

-        GPU_SANITY_CHECK(dst, 1e-10);
+        CUDA_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
@ -123,9 +123,9 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
 // CopyToMasked

 PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
-                    GPU_CHANNELS_1_3_4))
+                    CUDA_CHANNELS_1_3_4))
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -137,15 +137,15 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
    cv::Mat mask(size, CV_8UC1);
    declare.in(src, mask, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        const cv::gpu::GpuMat d_mask(mask);
-        cv::gpu::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));
+        const cv::cuda::GpuMat d_src(src);
+        const cv::cuda::GpuMat d_mask(mask);
+        cv::cuda::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));

        TEST_CYCLE() d_src.copyTo(dst, d_mask);

-        GPU_SANITY_CHECK(dst, 1e-10);
+        CUDA_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
@ -163,7 +163,7 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
 DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);

 PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
 {
@ -177,14 +177,14 @@ PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
    const double a = 0.5;
    const double b = 1.0;

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

        TEST_CYCLE() d_src.convertTo(dst, depth2, a, b);

-        GPU_SANITY_CHECK(dst, 1e-10);
+        CUDA_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
--- a/modules/cuda/perf/perf_objdetect.cpp
+++ b/modules/cuda/perf/perf_objdetect.cpp
@ -66,13 +66,13 @@ PERF_TEST_P(Image, ObjDetect_HOG,
    const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_img(img);
+        const cv::cuda::GpuMat d_img(img);
        std::vector<cv::Rect> gpu_found_locations;

-        cv::gpu::HOGDescriptor d_hog;
-        d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
+        cv::cuda::HOGDescriptor d_hog;
+        d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());

        TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);

@ -83,7 +83,7 @@ PERF_TEST_P(Image, ObjDetect_HOG,
        std::vector<cv::Rect> cpu_found_locations;

        cv::HOGDescriptor hog;
-        hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
+        hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());

        TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);

@ -103,13 +103,13 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::gpu::CascadeClassifier_GPU d_cascade;
+        cv::cuda::CascadeClassifier_CUDA d_cascade;
        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));

-        const cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat objects_buffer;
+        const cv::cuda::GpuMat d_img(img);
+        cv::cuda::GpuMat objects_buffer;
        int detections_num = 0;

        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
@ -142,13 +142,13 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::gpu::CascadeClassifier_GPU d_cascade;
+        cv::cuda::CascadeClassifier_CUDA d_cascade;
        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));

-        const cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat objects_buffer;
+        const cv::cuda::GpuMat d_img(img);
+        cv::cuda::GpuMat objects_buffer;
        int detections_num = 0;

        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
--- a/modules/cuda/perf/perf_precomp.hpp
+++ b/modules/cuda/perf/perf_precomp.hpp
@ -52,9 +52,9 @@
 #define __OPENCV_PERF_PRECOMP_HPP__

 #include "opencv2/ts.hpp"
-#include "opencv2/ts/gpu_perf.hpp"
+#include "opencv2/ts/cuda_perf.hpp"

-#include "opencv2/gpu.hpp"
+#include "opencv2/cuda.hpp"
 #include "opencv2/calib3d.hpp"
 #include "opencv2/objdetect.hpp"

--- a/modules/cuda/src/calib3d.cpp
+++ b/modules/cuda/src/calib3d.cpp
@ -43,19 +43,19 @@
 #include "precomp.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::cuda::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }

-void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::cuda::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }

-void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, std::vector<int>*) { throw_no_cuda(); }
+void cv::cuda::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, std::vector<int>*) { throw_no_cuda(); }

 #else

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace transform_points
    {
@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-using namespace ::cv::gpu::cudev;
+using namespace ::cv::cuda::device;

 namespace
 {
@ -97,7 +97,7 @@ namespace
    }
 }

-void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
+void cv::cuda::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
 {
    transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
 }
@ -121,7 +121,7 @@ namespace
    }
 }

-void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
+void cv::cuda::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
 {
    projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
 }
@ -208,7 +208,7 @@ namespace
    };
 }

-void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
+void cv::cuda::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
                             const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess,
                             int num_iters, float max_dist, int min_inlier_count,
                             std::vector<int>* inliers)
@ -252,7 +252,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
    // Find the best hypothesis index
    Point best_idx;
    double best_score;
-    gpu::minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
+    cuda::minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
    int num_inliers = static_cast<int>(best_score);

    // Extract the best hypothesis data
--- a/modules/cuda/src/cascadeclassifier.cpp
+++ b/modules/cuda/src/cascadeclassifier.cpp
@ -44,23 +44,23 @@
 #include "opencv2/objdetect/objdetect_c.h"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()               { throw_no_cuda(); }
-cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String&)  { throw_no_cuda(); }
-cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU()              { throw_no_cuda(); }
-bool cv::gpu::CascadeClassifier_GPU::empty() const                    { throw_no_cuda(); return true; }
-bool cv::gpu::CascadeClassifier_GPU::load(const String&)              { throw_no_cuda(); return true; }
-Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const        { throw_no_cuda(); return Size();}
-void cv::gpu::CascadeClassifier_GPU::release()                        { throw_no_cuda(); }
-int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size)       {throw_no_cuda(); return -1;}
-int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}
+cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA()               { throw_no_cuda(); }
+cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String&)  { throw_no_cuda(); }
+cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA()              { throw_no_cuda(); }
+bool cv::cuda::CascadeClassifier_CUDA::empty() const                    { throw_no_cuda(); return true; }
+bool cv::cuda::CascadeClassifier_CUDA::load(const String&)              { throw_no_cuda(); return true; }
+Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const        { throw_no_cuda(); return Size();}
+void cv::cuda::CascadeClassifier_CUDA::release()                        { throw_no_cuda(); }
+int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size)       {throw_no_cuda(); return -1;}
+int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}

 #else

-struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
+struct cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
 {
 public:
    CascadeClassifierImpl(){}
@ -73,9 +73,9 @@ public:
    virtual bool read(const String& classifierAsXml) = 0;
 };

-#ifndef HAVE_OPENCV_GPULEGACY
+#ifndef HAVE_OPENCV_CUDALEGACY

-struct cv::gpu::CascadeClassifier_GPU::HaarCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
+struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
 {
 public:
    HaarCascade()
@ -104,7 +104,7 @@ public:

 #else

-struct cv::gpu::CascadeClassifier_GPU::HaarCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
+struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
 {
 public:
    HaarCascade() : lastAllocatedFrameSize(-1, -1)
@ -203,7 +203,7 @@ private:

    NCVStatus load(const String& classifierFile)
    {
-        int devId = cv::gpu::getDevice();
+        int devId = cv::cuda::getDevice();
        ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);

        // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
@ -372,7 +372,7 @@ struct PyrLavel
    cv::Size sWindow;
 };

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace lbp
    {
@ -398,7 +398,7 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-struct cv::gpu::CascadeClassifier_GPU::LbpCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
+struct cv::cuda::CascadeClassifier_CUDA::LbpCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
 {
 public:
    struct Stage
@ -457,8 +457,8 @@ public:
                GpuMat buff = integralBuffer;

                // generate integral for scale
-                gpu::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
-                gpu::integral(src, sint, buff);
+                cuda::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
+                cuda::integral(src, sint, buff);

                // calculate job
                int totalWidth = level.workArea.width / step;
@ -473,7 +473,7 @@ public:
                acc += level.sFrame.width + 1;
            }

-            cudev::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
+            device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
                leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
        }

@ -481,7 +481,7 @@ public:
            return 0;

        cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
-        cudev::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
+        device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());

        cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaDeviceSynchronize() );
@ -509,13 +509,13 @@ private:

            integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);

-#ifdef HAVE_OPENCV_GPULEGACY
+#ifdef HAVE_OPENCV_CUDALEGACY
            NcvSize32u roiSize;
            roiSize.width = frame.width;
            roiSize.height = frame.height;

            cudaDeviceProp prop;
-            cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+            cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );

            Ncv32u bufSize;
            ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
@ -528,48 +528,48 @@ private:

    bool read(const FileNode &root)
    {
-        const char *GPU_CC_STAGE_TYPE       = "stageType";
-        const char *GPU_CC_FEATURE_TYPE     = "featureType";
-        const char *GPU_CC_BOOST            = "BOOST";
-        const char *GPU_CC_LBP              = "LBP";
-        const char *GPU_CC_MAX_CAT_COUNT    = "maxCatCount";
-        const char *GPU_CC_HEIGHT           = "height";
-        const char *GPU_CC_WIDTH            = "width";
-        const char *GPU_CC_STAGE_PARAMS     = "stageParams";
-        const char *GPU_CC_MAX_DEPTH        = "maxDepth";
-        const char *GPU_CC_FEATURE_PARAMS   = "featureParams";
-        const char *GPU_CC_STAGES           = "stages";
-        const char *GPU_CC_STAGE_THRESHOLD  = "stageThreshold";
-        const float GPU_THRESHOLD_EPS       = 1e-5f;
-        const char *GPU_CC_WEAK_CLASSIFIERS = "weakClassifiers";
-        const char *GPU_CC_INTERNAL_NODES   = "internalNodes";
-        const char *GPU_CC_LEAF_VALUES      = "leafValues";
-        const char *GPU_CC_FEATURES         = "features";
-        const char *GPU_CC_RECT             = "rect";
-
-        String stageTypeStr = (String)root[GPU_CC_STAGE_TYPE];
-        CV_Assert(stageTypeStr == GPU_CC_BOOST);
-
-        String featureTypeStr = (String)root[GPU_CC_FEATURE_TYPE];
-        CV_Assert(featureTypeStr == GPU_CC_LBP);
-
-        NxM.width =  (int)root[GPU_CC_WIDTH];
-        NxM.height = (int)root[GPU_CC_HEIGHT];
+        const char *CUDA_CC_STAGE_TYPE       = "stageType";
+        const char *CUDA_CC_FEATURE_TYPE     = "featureType";
+        const char *CUDA_CC_BOOST            = "BOOST";
+        const char *CUDA_CC_LBP              = "LBP";
+        const char *CUDA_CC_MAX_CAT_COUNT    = "maxCatCount";
+        const char *CUDA_CC_HEIGHT           = "height";
+        const char *CUDA_CC_WIDTH            = "width";
+        const char *CUDA_CC_STAGE_PARAMS     = "stageParams";
+        const char *CUDA_CC_MAX_DEPTH        = "maxDepth";
+        const char *CUDA_CC_FEATURE_PARAMS   = "featureParams";
+        const char *CUDA_CC_STAGES           = "stages";
+        const char *CUDA_CC_STAGE_THRESHOLD  = "stageThreshold";
+        const float CUDA_THRESHOLD_EPS       = 1e-5f;
+        const char *CUDA_CC_WEAK_CLASSIFIERS = "weakClassifiers";
+        const char *CUDA_CC_INTERNAL_NODES   = "internalNodes";
+        const char *CUDA_CC_LEAF_VALUES      = "leafValues";
+        const char *CUDA_CC_FEATURES         = "features";
+        const char *CUDA_CC_RECT             = "rect";
+
+        String stageTypeStr = (String)root[CUDA_CC_STAGE_TYPE];
+        CV_Assert(stageTypeStr == CUDA_CC_BOOST);
+
+        String featureTypeStr = (String)root[CUDA_CC_FEATURE_TYPE];
+        CV_Assert(featureTypeStr == CUDA_CC_LBP);
+
+        NxM.width =  (int)root[CUDA_CC_WIDTH];
+        NxM.height = (int)root[CUDA_CC_HEIGHT];
        CV_Assert( NxM.height > 0 && NxM.width > 0 );

-        isStumps = ((int)(root[GPU_CC_STAGE_PARAMS][GPU_CC_MAX_DEPTH]) == 1) ? true : false;
+        isStumps = ((int)(root[CUDA_CC_STAGE_PARAMS][CUDA_CC_MAX_DEPTH]) == 1) ? true : false;
        CV_Assert(isStumps);

-        FileNode fn = root[GPU_CC_FEATURE_PARAMS];
+        FileNode fn = root[CUDA_CC_FEATURE_PARAMS];
        if (fn.empty())
            return false;

-        ncategories = fn[GPU_CC_MAX_CAT_COUNT];
+        ncategories = fn[CUDA_CC_MAX_CAT_COUNT];

        subsetSize = (ncategories + 31) / 32;
        nodeStep = 3 + ( ncategories > 0 ? subsetSize : 1 );

-        fn = root[GPU_CC_STAGES];
+        fn = root[CUDA_CC_STAGES];
        if (fn.empty())
            return false;

@ -586,9 +586,9 @@ private:
        {
            FileNode fns = *it;
            Stage st;
-            st.threshold = (float)fns[GPU_CC_STAGE_THRESHOLD] - GPU_THRESHOLD_EPS;
+            st.threshold = (float)fns[CUDA_CC_STAGE_THRESHOLD] - CUDA_THRESHOLD_EPS;

-            fns = fns[GPU_CC_WEAK_CLASSIFIERS];
+            fns = fns[CUDA_CC_WEAK_CLASSIFIERS];
            if (fns.empty())
                return false;

@ -605,8 +605,8 @@ private:
            {
                FileNode fnw = *it1;

-                FileNode internalNodes = fnw[GPU_CC_INTERNAL_NODES];
-                FileNode leafValues = fnw[GPU_CC_LEAF_VALUES];
+                FileNode internalNodes = fnw[CUDA_CC_INTERNAL_NODES];
+                FileNode leafValues = fnw[CUDA_CC_LEAF_VALUES];
                if ( internalNodes.empty() || leafValues.empty() )
                    return false;

@ -640,7 +640,7 @@ private:
            }
        }

-        fn = root[GPU_CC_FEATURES];
+        fn = root[CUDA_CC_FEATURES];
        if( fn.empty() )
            return false;
        std::vector<uchar> features;
@ -648,7 +648,7 @@ private:
        FileNodeIterator f_it = fn.begin(), f_end = fn.end();
        for (; f_it != f_end; ++f_it)
        {
-            FileNode rect = (*f_it)[GPU_CC_RECT];
+            FileNode rect = (*f_it)[CUDA_CC_RECT];
            FileNodeIterator r_it = rect.begin();
            features.push_back(saturate_cast<uchar>((int)*(r_it++)));
            features.push_back(saturate_cast<uchar>((int)*(r_it++)));
@ -694,36 +694,36 @@ private:
    static const int integralFactor = 4;
 };

-cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()
+cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA()
 : findLargestObject(false), visualizeInPlace(false), impl(0) {}

-cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String& filename)
+cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String& filename)
 : findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }

-cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); }
+cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA() { release(); }

-void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } }
+void cv::cuda::CascadeClassifier_CUDA::release() { if (impl) { delete impl; impl = 0; } }

-bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; }
+bool cv::cuda::CascadeClassifier_CUDA::empty() const { return impl == 0; }

-Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const
+Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const
 {
    return this->empty() ? Size() : impl->getClassifierCvSize();
 }

-int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
+int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
 {
    CV_Assert( !this->empty());
    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
 }

-int cv::gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
+int cv::cuda::CascadeClassifier_CUDA::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
 {
    CV_Assert( !this->empty());
    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize);
 }

-bool cv::gpu::CascadeClassifier_GPU::load(const String& filename)
+bool cv::cuda::CascadeClassifier_CUDA::load(const String& filename)
 {
    release();

@ -744,9 +744,9 @@ bool cv::gpu::CascadeClassifier_GPU::load(const String& filename)
        return impl->read(filename);
    }

-    const char *GPU_CC_LBP = "LBP";
+    const char *CUDA_CC_LBP = "LBP";
    String featureTypeStr = (String)fs.getFirstTopLevelNode()["featureType"];
-    if (featureTypeStr == GPU_CC_LBP)
+    if (featureTypeStr == CUDA_CC_LBP)
        impl = new LbpCascade();
    else
        impl = new HaarCascade();
--- a/modules/cuda/src/cuda/calib3d.cu
+++ b/modules/cuda/src/cuda/calib3d.cu
@ -47,7 +47,7 @@
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/reduce.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200

@ -79,7 +79,7 @@ namespace cv { namespace gpu { namespace cudev
            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-            cv::gpu::cudev::transform(src, dst, TransformOp(), WithOutMask(), stream);
+            cv::cuda::device::transform(src, dst, TransformOp(), WithOutMask(), stream);
        }
    } // namespace transform_points

@ -120,7 +120,7 @@ namespace cv { namespace gpu { namespace cudev
            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
            cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
            cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
-            cv::gpu::cudev::transform(src, dst, ProjectOp(), WithOutMask(), stream);
+            cv::cuda::device::transform(src, dst, ProjectOp(), WithOutMask(), stream);
        }
    } // namespace project_points

@ -187,7 +187,7 @@ namespace cv { namespace gpu { namespace cudev
            cudaSafeCall( cudaDeviceSynchronize() );
        }
    } // namespace solvepnp_ransac
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device


 #endif /* CUDA_DISABLER */
--- a/modules/cuda/src/cuda/ccomponetns.cu
+++ b/modules/cuda/src/cuda/ccomponetns.cu
@ -50,7 +50,7 @@
 #include <iostream>
 #include <stdio.h>

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace ccl
    {
--- a/modules/cuda/src/cuda/global_motion.cu
+++ b/modules/cuda/src/cuda/global_motion.cu
@ -47,7 +47,7 @@
 #include <thrust/functional.h>
 #include "opencv2/core/cuda/common.hpp"

-namespace cv { namespace gpu { namespace cudev { namespace globmotion {
+namespace cv { namespace cuda { namespace device { namespace globmotion {

 __constant__ float cml[9];
 __constant__ float cmr[9];
--- a/modules/cuda/src/cuda/hog.cu
+++ b/modules/cuda/src/cuda/hog.cu
@ -47,7 +47,7 @@
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/warp_shuffle.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    // Other values are not supported
    #define CELL_WIDTH 8
@ -808,7 +808,7 @@ namespace cv { namespace gpu { namespace cudev
        void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
        void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
    } // namespace hog
-}}} // namespace cv { namespace gpu { namespace cudev
+}}} // namespace cv { namespace cuda { namespace device


 #endif /* CUDA_DISABLER */
--- a/modules/cuda/src/cuda/lbp.cu
+++ b/modules/cuda/src/cuda/lbp.cu
@ -46,7 +46,7 @@
 #include "opencv2/core/cuda/vec_traits.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace lbp
    {
--- a/modules/cuda/src/cuda/lbp.hpp
+++ b/modules/cuda/src/cuda/lbp.hpp
@ -40,13 +40,13 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_DEVICE_LBP_HPP_
-#define __OPENCV_GPU_DEVICE_LBP_HPP_
+#ifndef __OPENCV_CUDA_DEVICE_LBP_HPP_
+#define __OPENCV_CUDA_DEVICE_LBP_HPP_

 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/emulation.hpp"

-namespace cv { namespace gpu { namespace cudev {
+namespace cv { namespace cuda { namespace device {

 namespace lbp {

--- a/modules/cuda/src/global_motion.cpp
+++ b/modules/cuda/src/global_motion.cpp
@ -43,17 +43,17 @@
 #include "precomp.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-void cv::gpu::compactPoints(GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::calcWobbleSuppressionMaps(
+void cv::cuda::compactPoints(GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
+void cv::cuda::calcWobbleSuppressionMaps(
        int, int, int, Size, const Mat&, const Mat&, GpuMat&, GpuMat&) { throw_no_cuda(); }

 #else

-namespace cv { namespace gpu { namespace cudev { namespace globmotion {
+namespace cv { namespace cuda { namespace device { namespace globmotion {

    int compactPoints(int N, float *points0, float *points1, const uchar *mask);

@ -63,14 +63,14 @@ namespace cv { namespace gpu { namespace cudev { namespace globmotion {

 }}}}

-void cv::gpu::compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask)
+void cv::cuda::compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask)
 {
    CV_Assert(points0.rows == 1 && points1.rows == 1 && mask.rows == 1);
    CV_Assert(points0.type() == CV_32FC2 && points1.type() == CV_32FC2 && mask.type() == CV_8U);
    CV_Assert(points0.cols == mask.cols && points1.cols == mask.cols);

    int npoints = points0.cols;
-    int remaining = cv::gpu::cudev::globmotion::compactPoints(
+    int remaining = cv::cuda::device::globmotion::compactPoints(
            npoints, (float*)points0.data, (float*)points1.data, mask.data);

    points0 = points0.colRange(0, remaining);
@ -78,7 +78,7 @@ void cv::gpu::compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask
 }


-void cv::gpu::calcWobbleSuppressionMaps(
+void cv::cuda::calcWobbleSuppressionMaps(
        int left, int idx, int right, Size size, const Mat &ml, const Mat &mr,
        GpuMat &mapx, GpuMat &mapy)
 {
@ -88,7 +88,7 @@ void cv::gpu::calcWobbleSuppressionMaps(
    mapx.create(size, CV_32F);
    mapy.create(size, CV_32F);

-    cv::gpu::cudev::globmotion::calcWobbleSuppressionMaps(
+    cv::cuda::device::globmotion::calcWobbleSuppressionMaps(
                left, idx, right, size.width, size.height,
                ml.ptr<float>(), mr.ptr<float>(), mapx, mapy);
 }
--- a/modules/cuda/src/graphcuts.cpp
+++ b/modules/cuda/src/graphcuts.cpp
@ -44,15 +44,15 @@

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::cuda::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::cuda::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }

-void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_no_cuda(); }
-void cv::gpu::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::cuda::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_no_cuda(); }
+void cv::cuda::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }

 #else /* !defined (HAVE_CUDA) */

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace ccl
    {
@ -68,7 +68,7 @@ static float4 scalarToCudaType(const cv::Scalar& in)
  return make_float4((float)in[0], (float)in[1], (float)in[2], (float)in[3]);
 }

-void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
+void cv::cuda::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
 {
    CV_Assert(!image.empty());

@ -81,12 +81,12 @@ void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scal

    static const func_t suppotLookup[8][4] =
    {   //    1,    2,     3,     4
-        { cudev::ccl::computeEdges<uchar>,  0,  cudev::ccl::computeEdges<uchar3>,  cudev::ccl::computeEdges<uchar4>  },// CV_8U
+        { device::ccl::computeEdges<uchar>,  0,  device::ccl::computeEdges<uchar3>,  device::ccl::computeEdges<uchar4>  },// CV_8U
         { 0,                                 0,  0,                                  0                                  },// CV_8S
-        { cudev::ccl::computeEdges<ushort>, 0,  cudev::ccl::computeEdges<ushort3>, cudev::ccl::computeEdges<ushort4> },// CV_8S
+        { device::ccl::computeEdges<ushort>, 0,  device::ccl::computeEdges<ushort3>, device::ccl::computeEdges<ushort4> },// CV_16U
        { 0,                                 0,  0,                                  0                                  },// CV_16S
-        { cudev::ccl::computeEdges<int>,    0,  0,                                  0                                  },// CV_32S
-        { cudev::ccl::computeEdges<float>,  0,  0,                                  0                                  },// CV_32F
+        { device::ccl::computeEdges<int>,    0,  0,                                  0                                  },// CV_32S
+        { device::ccl::computeEdges<float>,  0,  0,                                  0                                  },// CV_32F
        { 0,                                 0,  0,                                  0                                  },// CV_64F
        { 0,                                 0,  0,                                  0                                  } // CV_USRTYPE1
    };
@ -102,7 +102,7 @@ void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scal
    f(image, mask, culo, cuhi, stream);
 }

-void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
+void cv::cuda::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
 {
    CV_Assert(!mask.empty() && mask.type() == CV_8U);

@ -112,7 +112,7 @@ void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags,
    components.create(mask.size(), CV_32SC1);

    cudaStream_t stream = StreamAccessor::getStream(s);
-    cudev::ccl::labelComponents(mask, components, flags, stream);
+    device::ccl::labelComponents(mask, components, flags, stream);
 }

 namespace
@ -142,7 +142,7 @@ namespace
    };
 }

-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
+void cv::cuda::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
 {
 #if (CUDA_VERSION < 5000)
    CV_Assert(terminals.type() == CV_32S);
@ -201,7 +201,7 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
        cudaSafeCall( cudaDeviceSynchronize() );
 }

-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
+void cv::cuda::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
              GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
 {
 #if (CUDA_VERSION < 5000)
--- a/modules/cuda/src/hog.cpp
+++ b/modules/cuda/src/hog.cpp
@ -44,25 +44,25 @@

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-cv::gpu::HOGDescriptor::HOGDescriptor(Size, Size, Size, Size, int, double, double, bool, int) { throw_no_cuda(); }
-size_t cv::gpu::HOGDescriptor::getDescriptorSize() const { throw_no_cuda(); return 0; }
-size_t cv::gpu::HOGDescriptor::getBlockHistogramSize() const { throw_no_cuda(); return 0; }
-double cv::gpu::HOGDescriptor::getWinSigma() const { throw_no_cuda(); return 0; }
-bool cv::gpu::HOGDescriptor::checkDetectorSize() const { throw_no_cuda(); return false; }
-void cv::gpu::HOGDescriptor::setSVMDetector(const std::vector<float>&) { throw_no_cuda(); }
-void cv::gpu::HOGDescriptor::detect(const GpuMat&, std::vector<Point>&, double, Size, Size) { throw_no_cuda(); }
-void cv::gpu::HOGDescriptor::detectMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, double, int) { throw_no_cuda(); }
-void cv::gpu::HOGDescriptor::computeBlockHistograms(const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat&, Size, GpuMat&, int) { throw_no_cuda(); }
-std::vector<float> cv::gpu::HOGDescriptor::getDefaultPeopleDetector() { throw_no_cuda(); return std::vector<float>(); }
-std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector48x96() { throw_no_cuda(); return std::vector<float>(); }
-std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector64x128() { throw_no_cuda(); return std::vector<float>(); }
-void cv::gpu::HOGDescriptor::computeConfidence(const GpuMat&, std::vector<Point>&, double, Size, Size, std::vector<Point>&, std::vector<double>&) { throw_no_cuda(); }
-void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, std::vector<HOGConfidence>&, int) { throw_no_cuda(); }
+cv::cuda::HOGDescriptor::HOGDescriptor(Size, Size, Size, Size, int, double, double, bool, int) { throw_no_cuda(); }
+size_t cv::cuda::HOGDescriptor::getDescriptorSize() const { throw_no_cuda(); return 0; }
+size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const { throw_no_cuda(); return 0; }
+double cv::cuda::HOGDescriptor::getWinSigma() const { throw_no_cuda(); return 0; }
+bool cv::cuda::HOGDescriptor::checkDetectorSize() const { throw_no_cuda(); return false; }
+void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>&) { throw_no_cuda(); }
+void cv::cuda::HOGDescriptor::detect(const GpuMat&, std::vector<Point>&, double, Size, Size) { throw_no_cuda(); }
+void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, double, int) { throw_no_cuda(); }
+void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat&) { throw_no_cuda(); }
+void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat&, Size, GpuMat&, int) { throw_no_cuda(); }
+std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector() { throw_no_cuda(); return std::vector<float>(); }
+std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96() { throw_no_cuda(); return std::vector<float>(); }
+std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128() { throw_no_cuda(); return std::vector<float>(); }
+void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat&, std::vector<Point>&, double, Size, Size, std::vector<Point>&, std::vector<double>&) { throw_no_cuda(); }
+void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, std::vector<HOGConfidence>&, int) { throw_no_cuda(); }

 #else

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    namespace hog
    {
@ -70,8 +70,8 @@ namespace cv { namespace gpu { namespace cudev
                              int nblocks_win_x, int nblocks_win_y);

        void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                           int height, int width, const cv::gpu::PtrStepSzf& grad,
-                           const cv::gpu::PtrStepSzb& qangle, float sigma, float* block_hists);
+                           int height, int width, const cv::cuda::PtrStepSzf& grad,
+                           const cv::cuda::PtrStepSzb& qangle, float sigma, float* block_hists);

        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
                             int height, int width, float* block_hists, float threshold);
@ -87,24 +87,24 @@ namespace cv { namespace gpu { namespace cudev

        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                    cv::gpu::PtrStepSzf descriptors);
+                                    cv::cuda::PtrStepSzf descriptors);
        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                    cv::gpu::PtrStepSzf descriptors);
+                                    cv::cuda::PtrStepSzf descriptors);

-        void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::PtrStepSzb& img,
-                                    float angle_scale, cv::gpu::PtrStepSzf grad, cv::gpu::PtrStepSzb qangle, bool correct_gamma);
-        void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::PtrStepSzb& img,
-                                    float angle_scale, cv::gpu::PtrStepSzf grad, cv::gpu::PtrStepSzb qangle, bool correct_gamma);
+        void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
+        void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);

-        void resize_8UC1(const cv::gpu::PtrStepSzb& src, cv::gpu::PtrStepSzb dst);
-        void resize_8UC4(const cv::gpu::PtrStepSzb& src, cv::gpu::PtrStepSzb dst);
+        void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
+        void resize_8UC4(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
    }
 }}}

-using namespace ::cv::gpu::cudev;
+using namespace ::cv::cuda::device;

-cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
+cv::cuda::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
                                      int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
        : win_size(win_size_),
          block_size(block_size_),
@ -132,30 +132,30 @@ cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
    hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
 }

-size_t cv::gpu::HOGDescriptor::getDescriptorSize() const
+size_t cv::cuda::HOGDescriptor::getDescriptorSize() const
 {
    return numPartsWithin(win_size, block_size, block_stride).area() * getBlockHistogramSize();
 }

-size_t cv::gpu::HOGDescriptor::getBlockHistogramSize() const
+size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const
 {
    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
    return (size_t)(nbins * cells_per_block.area());
 }

-double cv::gpu::HOGDescriptor::getWinSigma() const
+double cv::cuda::HOGDescriptor::getWinSigma() const
 {
    return win_sigma >= 0 ? win_sigma : (block_size.width + block_size.height) / 8.0;
 }

-bool cv::gpu::HOGDescriptor::checkDetectorSize() const
+bool cv::cuda::HOGDescriptor::checkDetectorSize() const
 {
    size_t detector_size = detector.rows * detector.cols;
    size_t descriptor_size = getDescriptorSize();
    return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
 }

-void cv::gpu::HOGDescriptor::setSVMDetector(const std::vector<float>& _detector)
+void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>& _detector)
 {
    std::vector<float> detector_reordered(_detector.size());

@ -179,7 +179,7 @@ void cv::gpu::HOGDescriptor::setSVMDetector(const std::vector<float>& _detector)
    CV_Assert(checkDetectorSize());
 }

-cv::gpu::GpuMat cv::gpu::HOGDescriptor::getBuffer(const Size& sz, int type, GpuMat& buf)
+cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(const Size& sz, int type, GpuMat& buf)
 {
    if (buf.empty() || buf.type() != type)
        buf.create(sz, type);
@ -190,13 +190,13 @@ cv::gpu::GpuMat cv::gpu::HOGDescriptor::getBuffer(const Size& sz, int type, GpuM
    return buf(Rect(Point(0,0), sz));
 }

-cv::gpu::GpuMat cv::gpu::HOGDescriptor::getBuffer(int rows, int cols, int type, GpuMat& buf)
+cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(int rows, int cols, int type, GpuMat& buf)
 {
    return getBuffer(Size(cols, rows), type, buf);
 }


-void cv::gpu::HOGDescriptor::computeGradient(const GpuMat& img, GpuMat& _grad, GpuMat& _qangle)
+void cv::cuda::HOGDescriptor::computeGradient(const GpuMat& img, GpuMat& _grad, GpuMat& _qangle)
 {
    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);

@ -219,7 +219,7 @@ void cv::gpu::HOGDescriptor::computeGradient(const GpuMat& img, GpuMat& _grad, G
 }


-void cv::gpu::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
+void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
 {
    computeGradient(img, grad, qangle);

@ -237,7 +237,7 @@ void cv::gpu::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
 }


-void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format)
+void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format)
 {
    CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);

@ -264,7 +264,7 @@ void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride,
    }
 }

-void cv::gpu::HOGDescriptor::computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
+void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
                          Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences)
 {
  CV_Assert(padding == Size(0, 0));
@ -307,7 +307,7 @@ void cv::gpu::HOGDescriptor::computeConfidence(const GpuMat& img, std::vector<Po
    }
 }

-void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
+void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
                            double hit_threshold, Size win_stride, Size padding,
                            std::vector<HOGConfidence> &conf_out, int group_threshold)
 {
@ -359,7 +359,7 @@ void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std:
 }


-void cv::gpu::HOGDescriptor::detect(const GpuMat& img, std::vector<Point>& hits, double hit_threshold, Size win_stride, Size padding)
+void cv::cuda::HOGDescriptor::detect(const GpuMat& img, std::vector<Point>& hits, double hit_threshold, Size win_stride, Size padding)
 {
    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
    CV_Assert(padding == Size(0, 0));
@ -396,7 +396,7 @@ void cv::gpu::HOGDescriptor::detect(const GpuMat& img, std::vector<Point>& hits,



-void cv::gpu::HOGDescriptor::detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations, double hit_threshold,
+void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations, double hit_threshold,
                                              Size win_stride, Size padding, double scale0, int group_threshold)
 {

@ -450,22 +450,22 @@ void cv::gpu::HOGDescriptor::detectMultiScale(const GpuMat& img, std::vector<Rec
    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
 }

-int cv::gpu::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
+int cv::cuda::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
 {
    return (size - part_size + stride) / stride;
 }

-cv::Size cv::gpu::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
+cv::Size cv::cuda::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
 {
    return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
 }

-std::vector<float> cv::gpu::HOGDescriptor::getDefaultPeopleDetector()
+std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector()
 {
    return getPeopleDetector64x128();
 }

-std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector48x96()
+std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96()
 {
    static const float detector[] = {
        0.294350f, -0.098796f, -0.129522f, 0.078753f, 0.387527f, 0.261529f,
@ -805,7 +805,7 @@ std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector48x96()



-std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector64x128()
+std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128()
 {
    static const float detector[] = {
       0.05359386f, -0.14721455f, -0.05532170f, 0.05077307f,
--- a/modules/cuda/src/precomp.hpp
+++ b/modules/cuda/src/precomp.hpp
@ -43,18 +43,18 @@
 #ifndef __OPENCV_PRECOMP_H__
 #define __OPENCV_PRECOMP_H__

-#include "opencv2/gpu.hpp"
-#include "opencv2/gpuarithm.hpp"
-#include "opencv2/gpuwarping.hpp"
+#include "opencv2/cuda.hpp"
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudawarping.hpp"
 #include "opencv2/calib3d.hpp"
 #include "opencv2/objdetect.hpp"

-#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/core/private.cuda.hpp"

 #include "opencv2/opencv_modules.hpp"

-#ifdef HAVE_OPENCV_GPULEGACY
-#  include "opencv2/gpulegacy/private.hpp"
+#ifdef HAVE_OPENCV_CUDALEGACY
+#  include "opencv2/cudalegacy/private.hpp"
 #endif

 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/cuda/test/test_calib3d.cpp
+++ b/modules/cuda/test/test_calib3d.cpp
@ -49,26 +49,26 @@ using namespace cvtest;
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // transformPoints

-struct TransformPoints : testing::TestWithParam<cv::gpu::DeviceInfo>
+struct TransformPoints : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;

    virtual void SetUp()
    {
        devInfo = GetParam();

-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(TransformPoints, Accuracy)
+CUDA_TEST_P(TransformPoints, Accuracy)
 {
    cv::Mat src = randomMat(cv::Size(1000, 1), CV_32FC3, 0, 10);
    cv::Mat rvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
    cv::Mat tvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);

-    cv::gpu::GpuMat dst;
-    cv::gpu::transformPoints(loadMat(src), rvec, tvec, dst);
+    cv::cuda::GpuMat dst;
+    cv::cuda::transformPoints(loadMat(src), rvec, tvec, dst);

    ASSERT_EQ(src.size(), dst.size());
    ASSERT_EQ(src.type(), dst.type());
@ -92,24 +92,24 @@ GPU_TEST_P(TransformPoints, Accuracy)
    }
 }

-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, TransformPoints, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_Calib3D, TransformPoints, ALL_DEVICES);

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // ProjectPoints

-struct ProjectPoints : testing::TestWithParam<cv::gpu::DeviceInfo>
+struct ProjectPoints : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;

    virtual void SetUp()
    {
        devInfo = GetParam();

-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(ProjectPoints, Accuracy)
+CUDA_TEST_P(ProjectPoints, Accuracy)
 {
    cv::Mat src = randomMat(cv::Size(1000, 1), CV_32FC3, 0, 10);
    cv::Mat rvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
@ -120,8 +120,8 @@ GPU_TEST_P(ProjectPoints, Accuracy)
    camera_mat.at<float>(2, 0) = 0.f;
    camera_mat.at<float>(2, 1) = 0.f;

-    cv::gpu::GpuMat dst;
-    cv::gpu::projectPoints(loadMat(src), rvec, tvec, camera_mat, cv::Mat(), dst);
+    cv::cuda::GpuMat dst;
+    cv::cuda::projectPoints(loadMat(src), rvec, tvec, camera_mat, cv::Mat(), dst);

    ASSERT_EQ(1, dst.rows);
    ASSERT_EQ(MatType(CV_32FC2), MatType(dst.type()));
@ -142,24 +142,24 @@ GPU_TEST_P(ProjectPoints, Accuracy)
    }
 }

-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, ProjectPoints, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_Calib3D, ProjectPoints, ALL_DEVICES);

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // SolvePnPRansac

-struct SolvePnPRansac : testing::TestWithParam<cv::gpu::DeviceInfo>
+struct SolvePnPRansac : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;

    virtual void SetUp()
    {
        devInfo = GetParam();

-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(SolvePnPRansac, Accuracy)
+CUDA_TEST_P(SolvePnPRansac, Accuracy)
 {
    cv::Mat object = randomMat(cv::Size(5000, 1), CV_32FC3, 0, 100);
    cv::Mat camera_mat = randomMat(cv::Size(3, 3), CV_32F, 0.5, 1);
@ -177,7 +177,7 @@ GPU_TEST_P(SolvePnPRansac, Accuracy)

    cv::Mat rvec, tvec;
    std::vector<int> inliers;
-    cv::gpu::solvePnPRansac(object, cv::Mat(1, (int)image_vec.size(), CV_32FC2, &image_vec[0]),
+    cv::cuda::solvePnPRansac(object, cv::Mat(1, (int)image_vec.size(), CV_32FC2, &image_vec[0]),
                            camera_mat, cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)),
                            rvec, tvec, false, 200, 2.f, 100, &inliers);

@ -185,6 +185,6 @@ GPU_TEST_P(SolvePnPRansac, Accuracy)
    ASSERT_LE(cv::norm(tvec - tvec_gold), 1e-3);
 }

-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, SolvePnPRansac, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_Calib3D, SolvePnPRansac, ALL_DEVICES);

 #endif // HAVE_CUDA
--- a/modules/cuda/test/test_global_motion.cpp
+++ b/modules/cuda/test/test_global_motion.cpp
@ -47,12 +47,12 @@
 using namespace std;
 using namespace cv;

-struct CompactPoints : testing::TestWithParam<gpu::DeviceInfo>
+struct CompactPoints : testing::TestWithParam<cuda::DeviceInfo>
 {
-    virtual void SetUp() { gpu::setDevice(GetParam().deviceID()); }
+    virtual void SetUp() { cuda::setDevice(GetParam().deviceID()); }
 };

-GPU_TEST_P(CompactPoints, CanCompactizeSmallInput)
+CUDA_TEST_P(CompactPoints, CanCompactizeSmallInput)
 {
    Mat src0(1, 3, CV_32FC2);
    src0.at<Point2f>(0,0) = Point2f(0,0);
@ -69,8 +69,8 @@ GPU_TEST_P(CompactPoints, CanCompactizeSmallInput)
    mask.at<uchar>(0,1) = 0;
    mask.at<uchar>(0,2) = 1;

-    gpu::GpuMat dsrc0(src0), dsrc1(src1), dmask(mask);
-    gpu::compactPoints(dsrc0, dsrc1, dmask);
+    cuda::GpuMat dsrc0(src0), dsrc1(src1), dmask(mask);
+    cuda::compactPoints(dsrc0, dsrc1, dmask);

    dsrc0.download(src0);
    dsrc1.download(src1);
@ -85,6 +85,6 @@ GPU_TEST_P(CompactPoints, CanCompactizeSmallInput)
    ASSERT_TRUE(src1.at<Point2f>(0,1) == Point2f(1,2));
 }

-INSTANTIATE_TEST_CASE_P(GPU_GlobalMotion, CompactPoints, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_GlobalMotion, CompactPoints, ALL_DEVICES);

 #endif // HAVE_CUDA
--- a/modules/cuda/test/test_gpumat.cpp
+++ b/modules/cuda/test/test_gpumat.cpp
@ -49,9 +49,9 @@ using namespace cvtest;
 ////////////////////////////////////////////////////////////////////////////////
 // SetTo

-PARAM_TEST_CASE(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;
    cv::Size size;
    int type;
    bool useRoi;
@ -63,29 +63,29 @@ PARAM_TEST_CASE(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
        type = GET_PARAM(2);
        useRoi = GET_PARAM(3);

-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(SetTo, Zero)
+CUDA_TEST_P(SetTo, Zero)
 {
    cv::Scalar zero = cv::Scalar::all(0);

-    cv::gpu::GpuMat mat = createMat(size, type, useRoi);
+    cv::cuda::GpuMat mat = createMat(size, type, useRoi);
    mat.setTo(zero);

    EXPECT_MAT_NEAR(cv::Mat::zeros(size, type), mat, 0.0);
 }

-GPU_TEST_P(SetTo, SameVal)
+CUDA_TEST_P(SetTo, SameVal)
 {
    cv::Scalar val = cv::Scalar::all(randomDouble(0.0, 255.0));

-    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
    {
        try
        {
-            cv::gpu::GpuMat mat = createMat(size, type, useRoi);
+            cv::cuda::GpuMat mat = createMat(size, type, useRoi);
            mat.setTo(val);
        }
        catch (const cv::Exception& e)
@ -95,22 +95,22 @@ GPU_TEST_P(SetTo, SameVal)
    }
    else
    {
-        cv::gpu::GpuMat mat = createMat(size, type, useRoi);
+        cv::cuda::GpuMat mat = createMat(size, type, useRoi);
        mat.setTo(val);

        EXPECT_MAT_NEAR(cv::Mat(size, type, val), mat, 0.0);
    }
 }

-GPU_TEST_P(SetTo, DifferentVal)
+CUDA_TEST_P(SetTo, DifferentVal)
 {
    cv::Scalar val = randomScalar(0.0, 255.0);

-    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
    {
        try
        {
-            cv::gpu::GpuMat mat = createMat(size, type, useRoi);
+            cv::cuda::GpuMat mat = createMat(size, type, useRoi);
            mat.setTo(val);
        }
        catch (const cv::Exception& e)
@ -120,24 +120,24 @@ GPU_TEST_P(SetTo, DifferentVal)
    }
    else
    {
-        cv::gpu::GpuMat mat = createMat(size, type, useRoi);
+        cv::cuda::GpuMat mat = createMat(size, type, useRoi);
        mat.setTo(val);

        EXPECT_MAT_NEAR(cv::Mat(size, type, val), mat, 0.0);
    }
 }

-GPU_TEST_P(SetTo, Masked)
+CUDA_TEST_P(SetTo, Masked)
 {
    cv::Scalar val = randomScalar(0.0, 255.0);
    cv::Mat mat_gold = randomMat(size, type);
    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);

-    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
    {
        try
        {
-            cv::gpu::GpuMat mat = createMat(size, type, useRoi);
+            cv::cuda::GpuMat mat = createMat(size, type, useRoi);
            mat.setTo(val, loadMat(mask));
        }
        catch (const cv::Exception& e)
@ -147,7 +147,7 @@ GPU_TEST_P(SetTo, Masked)
    }
    else
    {
-        cv::gpu::GpuMat mat = loadMat(mat_gold, useRoi);
+        cv::cuda::GpuMat mat = loadMat(mat_gold, useRoi);
        mat.setTo(val, loadMat(mask, useRoi));

        mat_gold.setTo(val, mask);
@ -156,7 +156,7 @@ GPU_TEST_P(SetTo, Masked)
    }
 }

-INSTANTIATE_TEST_CASE_P(GPU_GpuMat, SetTo, testing::Combine(
+INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, SetTo, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    ALL_TYPES,
@ -165,9 +165,9 @@ INSTANTIATE_TEST_CASE_P(GPU_GpuMat, SetTo, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // CopyTo

-PARAM_TEST_CASE(CopyTo, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;
    cv::Size size;
    int type;
    bool useRoi;
@ -180,32 +180,32 @@ PARAM_TEST_CASE(CopyTo, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
        type = GET_PARAM(2);
        useRoi = GET_PARAM(3);

-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(CopyTo, WithOutMask)
+CUDA_TEST_P(CopyTo, WithOutMask)
 {
    cv::Mat src = randomMat(size, type);

-    cv::gpu::GpuMat d_src = loadMat(src, useRoi);
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::cuda::GpuMat d_src = loadMat(src, useRoi);
+    cv::cuda::GpuMat dst = createMat(size, type, useRoi);
    d_src.copyTo(dst);

    EXPECT_MAT_NEAR(src, dst, 0.0);
 }

-GPU_TEST_P(CopyTo, Masked)
+CUDA_TEST_P(CopyTo, Masked)
 {
    cv::Mat src = randomMat(size, type);
    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);

-    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
    {
        try
        {
-            cv::gpu::GpuMat d_src = loadMat(src);
-            cv::gpu::GpuMat dst;
+            cv::cuda::GpuMat d_src = loadMat(src);
+            cv::cuda::GpuMat dst;
            d_src.copyTo(dst, loadMat(mask, useRoi));
        }
        catch (const cv::Exception& e)
@ -215,8 +215,8 @@ GPU_TEST_P(CopyTo, Masked)
    }
    else
    {
-        cv::gpu::GpuMat d_src = loadMat(src, useRoi);
-        cv::gpu::GpuMat dst = loadMat(cv::Mat::zeros(size, type), useRoi);
+        cv::cuda::GpuMat d_src = loadMat(src, useRoi);
+        cv::cuda::GpuMat dst = loadMat(cv::Mat::zeros(size, type), useRoi);
        d_src.copyTo(dst, loadMat(mask, useRoi));

        cv::Mat dst_gold = cv::Mat::zeros(size, type);
@ -226,7 +226,7 @@ GPU_TEST_P(CopyTo, Masked)
    }
 }

-INSTANTIATE_TEST_CASE_P(GPU_GpuMat, CopyTo, testing::Combine(
+INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, CopyTo, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    ALL_TYPES,
@ -235,9 +235,9 @@ INSTANTIATE_TEST_CASE_P(GPU_GpuMat, CopyTo, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // ConvertTo

-PARAM_TEST_CASE(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth, UseRoi)
+PARAM_TEST_CASE(ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, UseRoi)
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;
    cv::Size size;
    int depth1;
    int depth2;
@ -251,20 +251,20 @@ PARAM_TEST_CASE(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth, Us
        depth2 = GET_PARAM(3);
        useRoi = GET_PARAM(4);

-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(ConvertTo, WithOutScaling)
+CUDA_TEST_P(ConvertTo, WithOutScaling)
 {
    cv::Mat src = randomMat(size, depth1);

-    if ((depth1 == CV_64F || depth2 == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    if ((depth1 == CV_64F || depth2 == CV_64F) && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
    {
        try
        {
-            cv::gpu::GpuMat d_src = loadMat(src);
-            cv::gpu::GpuMat dst;
+            cv::cuda::GpuMat d_src = loadMat(src);
+            cv::cuda::GpuMat dst;
            d_src.convertTo(dst, depth2);
        }
        catch (const cv::Exception& e)
@ -274,8 +274,8 @@ GPU_TEST_P(ConvertTo, WithOutScaling)
    }
    else
    {
-        cv::gpu::GpuMat d_src = loadMat(src, useRoi);
-        cv::gpu::GpuMat dst = createMat(size, depth2, useRoi);
+        cv::cuda::GpuMat d_src = loadMat(src, useRoi);
+        cv::cuda::GpuMat dst = createMat(size, depth2, useRoi);
        d_src.convertTo(dst, depth2);

        cv::Mat dst_gold;
@ -285,18 +285,18 @@ GPU_TEST_P(ConvertTo, WithOutScaling)
    }
 }

-GPU_TEST_P(ConvertTo, WithScaling)
+CUDA_TEST_P(ConvertTo, WithScaling)
 {
    cv::Mat src = randomMat(size, depth1);
    double a = randomDouble(0.0, 1.0);
    double b = randomDouble(-10.0, 10.0);

-    if ((depth1 == CV_64F || depth2 == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    if ((depth1 == CV_64F || depth2 == CV_64F) && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
    {
        try
        {
-            cv::gpu::GpuMat d_src = loadMat(src);
-            cv::gpu::GpuMat dst;
+            cv::cuda::GpuMat d_src = loadMat(src);
+            cv::cuda::GpuMat dst;
            d_src.convertTo(dst, depth2, a, b);
        }
        catch (const cv::Exception& e)
@ -306,8 +306,8 @@ GPU_TEST_P(ConvertTo, WithScaling)
    }
    else
    {
-        cv::gpu::GpuMat d_src = loadMat(src, useRoi);
-        cv::gpu::GpuMat dst = createMat(size, depth2, useRoi);
+        cv::cuda::GpuMat d_src = loadMat(src, useRoi);
+        cv::cuda::GpuMat dst = createMat(size, depth2, useRoi);
        d_src.convertTo(dst, depth2, a, b);

        cv::Mat dst_gold;
@ -317,7 +317,7 @@ GPU_TEST_P(ConvertTo, WithScaling)
    }
 }

-INSTANTIATE_TEST_CASE_P(GPU_GpuMat, ConvertTo, testing::Combine(
+INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, ConvertTo, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    ALL_DEPTH,
@ -327,35 +327,35 @@ INSTANTIATE_TEST_CASE_P(GPU_GpuMat, ConvertTo, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // ensureSizeIsEnough

-struct EnsureSizeIsEnough : testing::TestWithParam<cv::gpu::DeviceInfo>
+struct EnsureSizeIsEnough : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
    virtual void SetUp()
    {
-        cv::gpu::DeviceInfo devInfo = GetParam();
-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::DeviceInfo devInfo = GetParam();
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(EnsureSizeIsEnough, BufferReuse)
+CUDA_TEST_P(EnsureSizeIsEnough, BufferReuse)
 {
-    cv::gpu::GpuMat buffer(100, 100, CV_8U);
-    cv::gpu::GpuMat old = buffer;
+    cv::cuda::GpuMat buffer(100, 100, CV_8U);
+    cv::cuda::GpuMat old = buffer;

    // don't reallocate memory
-    cv::gpu::ensureSizeIsEnough(10, 20, CV_8U, buffer);
+    cv::cuda::ensureSizeIsEnough(10, 20, CV_8U, buffer);
    EXPECT_EQ(10, buffer.rows);
    EXPECT_EQ(20, buffer.cols);
    EXPECT_EQ(CV_8UC1, buffer.type());
    EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));

    // don't reallocate memory
-    cv::gpu::ensureSizeIsEnough(20, 30, CV_8U, buffer);
+    cv::cuda::ensureSizeIsEnough(20, 30, CV_8U, buffer);
    EXPECT_EQ(20, buffer.rows);
    EXPECT_EQ(30, buffer.cols);
    EXPECT_EQ(CV_8UC1, buffer.type());
    EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));
 }

-INSTANTIATE_TEST_CASE_P(GPU_GpuMat, EnsureSizeIsEnough, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, EnsureSizeIsEnough, ALL_DEVICES);

 #endif // HAVE_CUDA
--- a/modules/cuda/test/test_labeling.cpp
+++ b/modules/cuda/test/test_labeling.cpp
@ -151,14 +151,14 @@ namespace
    };
 }

-struct Labeling : testing::TestWithParam<cv::gpu::DeviceInfo>
+struct Labeling : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;

    virtual void SetUp()
    {
        devInfo = GetParam();
-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }

    cv::Mat loat_image()
@ -167,7 +167,7 @@ struct Labeling : testing::TestWithParam<cv::gpu::DeviceInfo>
    }
 };

-GPU_TEST_P(Labeling, DISABLED_ConnectedComponents)
+CUDA_TEST_P(Labeling, DISABLED_ConnectedComponents)
 {
    cv::Mat image;
    cvtColor(loat_image(), image, cv::COLOR_BGR2GRAY);
@ -179,19 +179,19 @@ GPU_TEST_P(Labeling, DISABLED_ConnectedComponents)
    GreedyLabeling host(image);
    host(host._labels);

-    cv::gpu::GpuMat mask;
+    cv::cuda::GpuMat mask;
    mask.create(image.rows, image.cols, CV_8UC1);

-    cv::gpu::GpuMat components;
+    cv::cuda::GpuMat components;
    components.create(image.rows, image.cols, CV_32SC1);

-    cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
+    cv::cuda::connectivityMask(cv::cuda::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));

-    cv::gpu::labelComponents(mask, components);
+    cv::cuda::labelComponents(mask, components);

    host.checkCorrectness(cv::Mat(components));
 }

-INSTANTIATE_TEST_CASE_P(GPU_ConnectedComponents, Labeling, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_ConnectedComponents, Labeling, ALL_DEVICES);

 #endif // HAVE_CUDA
--- a/modules/cuda/test/test_main.cpp
+++ b/modules/cuda/test/test_main.cpp
@ -42,4 +42,4 @@

 #include "test_precomp.hpp"

-CV_GPU_TEST_MAIN("gpu")
+CV_CUDA_TEST_MAIN("gpu")
--- a/modules/cuda/test/test_objdetect.cpp
+++ b/modules/cuda/test/test_objdetect.cpp
@ -48,9 +48,9 @@ using namespace cvtest;

 //#define DUMP

-struct HOG : testing::TestWithParam<cv::gpu::DeviceInfo>, cv::gpu::HOGDescriptor
+struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescriptor
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;

 #ifdef DUMP
    std::ofstream f;
@ -68,7 +68,7 @@ struct HOG : testing::TestWithParam<cv::gpu::DeviceInfo>, cv::gpu::HOGDescriptor
    {
        devInfo = GetParam();

-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }

 #ifdef DUMP
@ -127,7 +127,7 @@ struct HOG : testing::TestWithParam<cv::gpu::DeviceInfo>, cv::gpu::HOGDescriptor
    void testDetect(const cv::Mat& img)
    {
        gamma_correction = false;
-        setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
+        setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());

        std::vector<cv::Point> locations;

@ -177,7 +177,7 @@ struct HOG : testing::TestWithParam<cv::gpu::DeviceInfo>, cv::gpu::HOGDescriptor
 };

 // desabled while resize does not fixed
-GPU_TEST_P(HOG, Detect)
+CUDA_TEST_P(HOG, Detect)
 {
    cv::Mat img_rgb = readImage("hog/road.png");
    ASSERT_FALSE(img_rgb.empty());
@ -202,7 +202,7 @@ GPU_TEST_P(HOG, Detect)
    f.close();
 }

-GPU_TEST_P(HOG, GetDescriptors)
+CUDA_TEST_P(HOG, GetDescriptors)
 {
    // Load image (e.g. train data, composed from windows)
    cv::Mat img_rgb = readImage("hog/train_data.png");
@ -212,10 +212,10 @@ GPU_TEST_P(HOG, GetDescriptors)
    cv::Mat img;
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);

-    cv::gpu::GpuMat d_img(img);
+    cv::cuda::GpuMat d_img(img);

    // Convert train images into feature vectors (train table)
-    cv::gpu::GpuMat descriptors, descriptors_by_cols;
+    cv::cuda::GpuMat descriptors, descriptors_by_cols;
    getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
    getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);

@ -251,67 +251,67 @@ GPU_TEST_P(HOG, GetDescriptors)
    img_rgb = readImage("hog/positive1.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::gpu::GpuMat(img));
+    computeBlockHistograms(cv::cuda::GpuMat(img));
    // Everything is fine with interpolation for left top subimage
    ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));

    img_rgb = readImage("hog/positive2.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::gpu::GpuMat(img));
+    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));

    img_rgb = readImage("hog/negative1.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::gpu::GpuMat(img));
+    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));

    img_rgb = readImage("hog/negative2.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::gpu::GpuMat(img));
+    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));

    img_rgb = readImage("hog/positive3.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::gpu::GpuMat(img));
+    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));

    img_rgb = readImage("hog/negative3.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::gpu::GpuMat(img));
+    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
 }

-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);

 //============== caltech hog tests =====================//

-struct CalTech : public ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string> >
+struct CalTech : public ::testing::TestWithParam<std::tr1::tuple<cv::cuda::DeviceInfo, std::string> >
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;
    cv::Mat img;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());

        img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
        ASSERT_FALSE(img.empty());
    }
 };

-GPU_TEST_P(CalTech, HOG)
+CUDA_TEST_P(CalTech, HOG)
 {
-    cv::gpu::GpuMat d_img(img);
+    cv::cuda::GpuMat d_img(img);
    cv::Mat markedImage(img.clone());

-    cv::gpu::HOGDescriptor d_hog;
-    d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
+    cv::cuda::HOGDescriptor d_hog;
+    d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
    d_hog.nlevels = d_hog.nlevels + 32;

    std::vector<cv::Rect> found_locations;
@ -341,40 +341,40 @@ INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
 //////////////////////////////////////////////////////////////////////////////////////////
 /// LBP classifier

-PARAM_TEST_CASE(LBP_Read_classifier, cv::gpu::DeviceInfo, int)
+PARAM_TEST_CASE(LBP_Read_classifier, cv::cuda::DeviceInfo, int)
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(LBP_Read_classifier, Accuracy)
+CUDA_TEST_P(LBP_Read_classifier, Accuracy)
 {
-    cv::gpu::CascadeClassifier_GPU classifier;
+    cv::cuda::CascadeClassifier_CUDA classifier;
    std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
    ASSERT_TRUE(classifier.load(classifierXmlPath));
 }

-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_Read_classifier,
+INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_Read_classifier,
                        testing::Combine(ALL_DEVICES, testing::Values<int>(0)));


-PARAM_TEST_CASE(LBP_classify, cv::gpu::DeviceInfo, int)
+PARAM_TEST_CASE(LBP_classify, cv::cuda::DeviceInfo, int)
 {
-    cv::gpu::DeviceInfo devInfo;
+    cv::cuda::DeviceInfo devInfo;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::setDevice(devInfo.deviceID());
    }
 };

-GPU_TEST_P(LBP_classify, Accuracy)
+CUDA_TEST_P(LBP_classify, Accuracy)
 {
    std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
    std::string imagePath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/er.png";
@ -396,11 +396,11 @@ GPU_TEST_P(LBP_classify, Accuracy)
    for (; it != rects.end(); ++it)
        cv::rectangle(markedImage, *it, cv::Scalar(255, 0, 0));

-    cv::gpu::CascadeClassifier_GPU gpuClassifier;
+    cv::cuda::CascadeClassifier_CUDA gpuClassifier;
    ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));

-    cv::gpu::GpuMat gpu_rects;
-    cv::gpu::GpuMat tested(grey);
+    cv::cuda::GpuMat gpu_rects;
+    cv::cuda::GpuMat tested(grey);
    int count = gpuClassifier.detectMultiScale(tested, gpu_rects);

 #if defined (LOG_CASCADE_STATISTIC)
@ -421,7 +421,7 @@ GPU_TEST_P(LBP_classify, Accuracy)
    (void)count;
 }

-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_classify,
+INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_classify,
                        testing::Combine(ALL_DEVICES, testing::Values<int>(0)));

 #endif // HAVE_CUDA
--- a/modules/cuda/test/test_opengl.cpp
+++ b/modules/cuda/test/test_opengl.cpp
@ -71,7 +71,7 @@ PARAM_TEST_CASE(Buffer, cv::Size, MatType)
    }
 };

-GPU_TEST_P(Buffer, Constructor1)
+CUDA_TEST_P(Buffer, Constructor1)
 {
    cv::ogl::Buffer buf(size.height, size.width, type, cv::ogl::Buffer::ARRAY_BUFFER, true);

@ -80,7 +80,7 @@ GPU_TEST_P(Buffer, Constructor1)
    EXPECT_EQ(type, buf.type());
 }

-GPU_TEST_P(Buffer, Constructor2)
+CUDA_TEST_P(Buffer, Constructor2)
 {
    cv::ogl::Buffer buf(size, type, cv::ogl::Buffer::ARRAY_BUFFER, true);

@ -89,7 +89,7 @@ GPU_TEST_P(Buffer, Constructor2)
    EXPECT_EQ(type, buf.type());
 }

-GPU_TEST_P(Buffer, ConstructorFromMat)
+CUDA_TEST_P(Buffer, ConstructorFromMat)
 {
    cv::Mat gold = randomMat(size, type);

@ -101,10 +101,10 @@ GPU_TEST_P(Buffer, ConstructorFromMat)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, ConstructorFromGpuMat)
+CUDA_TEST_P(Buffer, ConstructorFromGpuMat)
 {
    cv::Mat gold = randomMat(size, type);
-    cv::gpu::GpuMat d_gold(gold);
+    cv::cuda::GpuMat d_gold(gold);

    cv::ogl::Buffer buf(d_gold, cv::ogl::Buffer::ARRAY_BUFFER);

@ -114,7 +114,7 @@ GPU_TEST_P(Buffer, ConstructorFromGpuMat)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, ConstructorFromBuffer)
+CUDA_TEST_P(Buffer, ConstructorFromBuffer)
 {
    cv::ogl::Buffer buf_gold(size, type, cv::ogl::Buffer::ARRAY_BUFFER, true);

@ -126,7 +126,7 @@ GPU_TEST_P(Buffer, ConstructorFromBuffer)
    EXPECT_EQ(buf_gold.type(), buf.type());
 }

-GPU_TEST_P(Buffer, Create)
+CUDA_TEST_P(Buffer, Create)
 {
    cv::ogl::Buffer buf;
    buf.create(size.height, size.width, type, cv::ogl::Buffer::ARRAY_BUFFER, true);
@ -136,7 +136,7 @@ GPU_TEST_P(Buffer, Create)
    EXPECT_EQ(type, buf.type());
 }

-GPU_TEST_P(Buffer, CopyFromMat)
+CUDA_TEST_P(Buffer, CopyFromMat)
 {
    cv::Mat gold = randomMat(size, type);

@ -149,10 +149,10 @@ GPU_TEST_P(Buffer, CopyFromMat)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, CopyFromGpuMat)
+CUDA_TEST_P(Buffer, CopyFromGpuMat)
 {
    cv::Mat gold = randomMat(size, type);
-    cv::gpu::GpuMat d_gold(gold);
+    cv::cuda::GpuMat d_gold(gold);

    cv::ogl::Buffer buf;
    buf.copyFrom(d_gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
@ -163,7 +163,7 @@ GPU_TEST_P(Buffer, CopyFromGpuMat)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, CopyFromBuffer)
+CUDA_TEST_P(Buffer, CopyFromBuffer)
 {
    cv::Mat gold = randomMat(size, type);
    cv::ogl::Buffer buf_gold(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
@ -179,19 +179,19 @@ GPU_TEST_P(Buffer, CopyFromBuffer)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, CopyToGpuMat)
+CUDA_TEST_P(Buffer, CopyToGpuMat)
 {
    cv::Mat gold = randomMat(size, type);

    cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);

-    cv::gpu::GpuMat dst;
+    cv::cuda::GpuMat dst;
    buf.copyTo(dst);

    EXPECT_MAT_NEAR(gold, dst, 0);
 }

-GPU_TEST_P(Buffer, CopyToBuffer)
+CUDA_TEST_P(Buffer, CopyToBuffer)
 {
    cv::Mat gold = randomMat(size, type);

@ -209,7 +209,7 @@ GPU_TEST_P(Buffer, CopyToBuffer)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, Clone)
+CUDA_TEST_P(Buffer, Clone)
 {
    cv::Mat gold = randomMat(size, type);

@ -225,7 +225,7 @@ GPU_TEST_P(Buffer, Clone)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, MapHostRead)
+CUDA_TEST_P(Buffer, MapHostRead)
 {
    cv::Mat gold = randomMat(size, type);

@ -238,7 +238,7 @@ GPU_TEST_P(Buffer, MapHostRead)
    buf.unmapHost();
 }

-GPU_TEST_P(Buffer, MapHostWrite)
+CUDA_TEST_P(Buffer, MapHostWrite)
 {
    cv::Mat gold = randomMat(size, type);

@ -255,13 +255,13 @@ GPU_TEST_P(Buffer, MapHostWrite)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, MapDevice)
+CUDA_TEST_P(Buffer, MapDevice)
 {
    cv::Mat gold = randomMat(size, type);

    cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);

-    cv::gpu::GpuMat dst = buf.mapDevice();
+    cv::cuda::GpuMat dst = buf.mapDevice();

    EXPECT_MAT_NEAR(gold, dst, 0);

@ -302,7 +302,7 @@ PARAM_TEST_CASE(Texture2D, cv::Size, MatType)
    }
 };

-GPU_TEST_P(Texture2D, Constructor1)
+CUDA_TEST_P(Texture2D, Constructor1)
 {
    cv::ogl::Texture2D tex(size.height, size.width, format, true);

@ -311,7 +311,7 @@ GPU_TEST_P(Texture2D, Constructor1)
    EXPECT_EQ(format, tex.format());
 }

-GPU_TEST_P(Texture2D, Constructor2)
+CUDA_TEST_P(Texture2D, Constructor2)
 {
    cv::ogl::Texture2D tex(size, format, true);

@ -320,7 +320,7 @@ GPU_TEST_P(Texture2D, Constructor2)
    EXPECT_EQ(format, tex.format());
 }

-GPU_TEST_P(Texture2D, ConstructorFromMat)
+CUDA_TEST_P(Texture2D, ConstructorFromMat)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);

@ -332,10 +332,10 @@ GPU_TEST_P(Texture2D, ConstructorFromMat)
    EXPECT_MAT_NEAR(gold, texData, 1e-2);
 }

-GPU_TEST_P(Texture2D, ConstructorFromGpuMat)
+CUDA_TEST_P(Texture2D, ConstructorFromGpuMat)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
-    cv::gpu::GpuMat d_gold(gold);
+    cv::cuda::GpuMat d_gold(gold);

    cv::ogl::Texture2D tex(d_gold, true);

@ -345,7 +345,7 @@ GPU_TEST_P(Texture2D, ConstructorFromGpuMat)
    EXPECT_MAT_NEAR(gold, texData, 1e-2);
 }

-GPU_TEST_P(Texture2D, ConstructorFromBuffer)
+CUDA_TEST_P(Texture2D, ConstructorFromBuffer)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
    cv::ogl::Buffer buf_gold(gold, cv::ogl::Buffer::PIXEL_UNPACK_BUFFER, true);
@ -358,7 +358,7 @@ GPU_TEST_P(Texture2D, ConstructorFromBuffer)
    EXPECT_MAT_NEAR(gold, texData, 1e-2);
 }

-GPU_TEST_P(Texture2D, ConstructorFromTexture2D)
+CUDA_TEST_P(Texture2D, ConstructorFromTexture2D)
 {
    cv::ogl::Texture2D tex_gold(size, format, true);
    cv::ogl::Texture2D tex(tex_gold);
@ -369,7 +369,7 @@ GPU_TEST_P(Texture2D, ConstructorFromTexture2D)
    EXPECT_EQ(tex_gold.format(), tex.format());
 }

-GPU_TEST_P(Texture2D, Create)
+CUDA_TEST_P(Texture2D, Create)
 {
    cv::ogl::Texture2D tex;
    tex.create(size.height, size.width, format, true);
@ -379,7 +379,7 @@ GPU_TEST_P(Texture2D, Create)
    EXPECT_EQ(format, tex.format());
 }

-GPU_TEST_P(Texture2D, CopyFromMat)
+CUDA_TEST_P(Texture2D, CopyFromMat)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);

@ -392,10 +392,10 @@ GPU_TEST_P(Texture2D, CopyFromMat)
    EXPECT_MAT_NEAR(gold, texData, 1e-2);
 }

-GPU_TEST_P(Texture2D, CopyFromGpuMat)
+CUDA_TEST_P(Texture2D, CopyFromGpuMat)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
-    cv::gpu::GpuMat d_gold(gold);
+    cv::cuda::GpuMat d_gold(gold);

    cv::ogl::Texture2D tex;
    tex.copyFrom(d_gold, true);
@ -406,7 +406,7 @@ GPU_TEST_P(Texture2D, CopyFromGpuMat)
    EXPECT_MAT_NEAR(gold, texData, 1e-2);
 }

-GPU_TEST_P(Texture2D, CopyFromBuffer)
+CUDA_TEST_P(Texture2D, CopyFromBuffer)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
    cv::ogl::Buffer buf_gold(gold, cv::ogl::Buffer::PIXEL_UNPACK_BUFFER, true);
@ -420,19 +420,19 @@ GPU_TEST_P(Texture2D, CopyFromBuffer)
    EXPECT_MAT_NEAR(gold, texData, 1e-2);
 }

-GPU_TEST_P(Texture2D, CopyToGpuMat)
+CUDA_TEST_P(Texture2D, CopyToGpuMat)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);

    cv::ogl::Texture2D tex(gold, true);

-    cv::gpu::GpuMat dst;
+    cv::cuda::GpuMat dst;
    tex.copyTo(dst, depth);

    EXPECT_MAT_NEAR(gold, dst, 1e-2);
 }

-GPU_TEST_P(Texture2D, CopyToBuffer)
+CUDA_TEST_P(Texture2D, CopyToBuffer)
 {
    cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);

--- a/modules/cuda/test/test_precomp.hpp
+++ b/modules/cuda/test/test_precomp.hpp
@ -54,9 +54,9 @@
 #include <fstream>

 #include "opencv2/ts.hpp"
-#include "opencv2/ts/gpu_test.hpp"
+#include "opencv2/ts/cuda_test.hpp"

-#include "opencv2/gpu.hpp"
+#include "opencv2/cuda.hpp"
 #include "opencv2/core.hpp"
 #include "opencv2/core/opengl.hpp"
 #include "opencv2/calib3d.hpp"
--- a/modules/cuda/test/test_stream.cpp
+++ b/modules/cuda/test/test_stream.cpp
@ -50,20 +50,20 @@

 using namespace cvtest;

-struct Async : testing::TestWithParam<cv::gpu::DeviceInfo>
+struct Async : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
-    cv::gpu::CudaMem src;
-    cv::gpu::GpuMat d_src;
+    cv::cuda::CudaMem src;
+    cv::cuda::GpuMat d_src;

-    cv::gpu::CudaMem dst;
-    cv::gpu::GpuMat d_dst;
+    cv::cuda::CudaMem dst;
+    cv::cuda::GpuMat d_dst;

    virtual void SetUp()
    {
-        cv::gpu::DeviceInfo devInfo = GetParam();
-        cv::gpu::setDevice(devInfo.deviceID());
+        cv::cuda::DeviceInfo devInfo = GetParam();
+        cv::cuda::setDevice(devInfo.deviceID());

-        src = cv::gpu::CudaMem(cv::gpu::CudaMem::PAGE_LOCKED);
+        src = cv::cuda::CudaMem(cv::cuda::CudaMem::PAGE_LOCKED);

        cv::Mat m = randomMat(cv::Size(128, 128), CV_8UC1);
        m.copyTo(src);
@ -76,17 +76,17 @@ void checkMemSet(int status, void* userData)

    Async* test = reinterpret_cast<Async*>(userData);

-    cv::gpu::CudaMem src = test->src;
-    cv::gpu::CudaMem dst = test->dst;
+    cv::cuda::CudaMem src = test->src;
+    cv::cuda::CudaMem dst = test->dst;

    cv::Mat dst_gold = cv::Mat::zeros(src.size(), src.type());

    ASSERT_MAT_NEAR(dst_gold, dst, 0);
 }

-GPU_TEST_P(Async, MemSet)
+CUDA_TEST_P(Async, MemSet)
 {
-    cv::gpu::Stream stream;
+    cv::cuda::Stream stream;

    d_dst.upload(src);

@ -105,8 +105,8 @@ void checkConvert(int status, void* userData)

    Async* test = reinterpret_cast<Async*>(userData);

-    cv::gpu::CudaMem src = test->src;
-    cv::gpu::CudaMem dst = test->dst;
+    cv::cuda::CudaMem src = test->src;
+    cv::cuda::CudaMem dst = test->dst;

    cv::Mat dst_gold;
    src.createMatHeader().convertTo(dst_gold, CV_32S);
@ -114,9 +114,9 @@ void checkConvert(int status, void* userData)
    ASSERT_MAT_NEAR(dst_gold, dst, 0);
 }

-GPU_TEST_P(Async, Convert)
+CUDA_TEST_P(Async, Convert)
 {
-    cv::gpu::Stream stream;
+    cv::cuda::Stream stream;

    d_src.upload(src, stream);
    d_src.convertTo(d_dst, CV_32S, stream);
@ -128,7 +128,7 @@ GPU_TEST_P(Async, Convert)
    stream.waitForCompletion();
 }

-INSTANTIATE_TEST_CASE_P(GPU_Stream, Async, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_Stream, Async, ALL_DEVICES);

 #endif // CUDART_VERSION >= 5000

--- a/modules/cudaarithm/CMakeLists.txt
+++ b/modules/cudaarithm/CMakeLists.txt
@ -1,12 +1,12 @@
 if(ANDROID OR IOS)
-  ocv_module_disable(gpuarithm)
+  ocv_module_disable(cudaarithm)
 endif()

-set(the_description "GPU-accelerated Operations on Matrices")
+set(the_description "CUDA-accelerated Operations on Matrices")

 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)

-ocv_add_module(gpuarithm opencv_core OPTIONAL opencv_gpulegacy)
+ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)

 ocv_module_include_directories()
 ocv_glob_module_sources()
--- a/modules/cudaarithm/doc/arithm.rst
+++ b/modules/cudaarithm/doc/arithm.rst
@ -5,11 +5,11 @@ Arithm Operations on Matrices



-gpu::gemm
---------
+cuda::gemm
+----------
 Performs generalized matrix multiplication.

-.. ocv:function:: void gpu::gemm(InputArray src1, InputArray src2, double alpha, InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::gemm(InputArray src1, InputArray src2, double alpha, InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null())

    :param src1: First multiplied input matrix that should have  ``CV_32FC1`` , ``CV_64FC1`` , ``CV_32FC2`` , or  ``CV_64FC2``  type.

@ -43,11 +43,11 @@ The function performs generalized matrix multiplication similar to the ``gemm``



-gpu::mulSpectrums
-----------------
+cuda::mulSpectrums
+------------------
 Performs a per-element multiplication of two Fourier spectrums.

-.. ocv:function:: void gpu::mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null())

    :param src1: First spectrum.

@ -55,7 +55,7 @@ Performs a per-element multiplication of two Fourier spectrums.

    :param dst: Destination spectrum.

-    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
+    :param flags: Mock parameter used for CPU/CUDA interfaces similarity.

    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.

@ -67,11 +67,11 @@ Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format



-gpu::mulAndScaleSpectrums
-------------------------
+cuda::mulAndScaleSpectrums
+--------------------------
 Performs a per-element multiplication of two Fourier spectrums and scales the result.

-.. ocv:function:: void gpu::mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null())

    :param src1: First spectrum.

@ -79,7 +79,7 @@ Performs a per-element multiplication of two Fourier spectrums and scales the re

    :param dst: Destination spectrum.

-    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
+    :param flags: Mock parameter used for CPU/CUDA interfaces similarity.

    :param scale: Scale constant.

@ -91,11 +91,11 @@ Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format



-gpu::dft
--------
+cuda::dft
+---------
 Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.

-.. ocv:function:: void gpu::dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null())

    :param src: Source matrix (real or complex).

@ -127,9 +127,9 @@ The source matrix should be continuous, otherwise reallocation and data copying



-gpu::Convolution
----------------
-.. ocv:class:: gpu::Convolution : public Algorithm
+cuda::Convolution
+-----------------
+.. ocv:class:: cuda::Convolution : public Algorithm

 Base class for convolution (or cross-correlation) operator. ::

@ -141,11 +141,11 @@ Base class for convolution (or cross-correlation) operator. ::



-gpu::Convolution::convolve
+cuda::Convolution::convolve
 ---------------------------
 Computes a convolution (or cross-correlation) of two images.

-.. ocv:function:: void gpu::Convolution::convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::Convolution::convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null())

    :param image: Source image. Only  ``CV_32FC1`` images are supported for now.

@ -159,9 +159,9 @@ Computes a convolution (or cross-correlation) of two images.



-gpu::createConvolution
----------------------
-Creates implementation for :ocv:class:`gpu::Convolution` .
+cuda::createConvolution
+-----------------------
+Creates implementation for :ocv:class:`cuda::Convolution` .

 .. ocv:function:: Ptr<Convolution> createConvolution(Size user_block_size = Size())

--- a/modules/cudaarithm/doc/core.rst
+++ b/modules/cudaarithm/doc/core.rst
@ -5,13 +5,13 @@ Core Operations on Matrices



-gpu::merge
----------
+cuda::merge
+-----------
 Makes a multi-channel matrix out of several single-channel matrices.

-.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Array/vector of source matrices.

@ -25,13 +25,13 @@ Makes a multi-channel matrix out of several single-channel matrices.



-gpu::split
----------
+cuda::split
+-----------
 Copies each plane of a multi-channel matrix into an array.

-.. ocv:function:: void gpu::split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::split(InputArray src, vector<GpuMat>& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -43,11 +43,11 @@ Copies each plane of a multi-channel matrix into an array.



-gpu::transpose
--------------
+cuda::transpose
+---------------
 Transposes a matrix.

-.. ocv:function:: void gpu::transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null())

    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now.

@ -59,11 +59,11 @@ Transposes a matrix.



-gpu::flip
---------
+cuda::flip
+----------
 Flips a 2D matrix around vertical, horizontal, or both axes.

-.. ocv:function:: void gpu::flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null())

    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.

@ -83,9 +83,9 @@ Flips a 2D matrix around vertical, horizontal, or both axes.



-gpu::LookUpTable
----------------
-.. ocv:class:: gpu::LookUpTable : public Algorithm
+cuda::LookUpTable
+-----------------
+.. ocv:class:: cuda::LookUpTable : public Algorithm

 Base class for transform using lookup table. ::

@ -99,11 +99,11 @@ Base class for transform using lookup table. ::



-gpu::LookUpTable::transform
---------------------------
+cuda::LookUpTable::transform
+----------------------------
 Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))`` .

-.. ocv:function:: void gpu::LookUpTable::transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::LookUpTable::transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.

@ -113,9 +113,9 @@ Transforms the source matrix into the destination matrix using the given look-up



-gpu::createLookUpTable
----------------------
-Creates implementation for :ocv:class:`gpu::LookUpTable` .
+cuda::createLookUpTable
+-----------------------
+Creates implementation for :ocv:class:`cuda::LookUpTable` .

 .. ocv:function:: Ptr<LookUpTable> createLookUpTable(InputArray lut)

@ -123,11 +123,11 @@ Creates implementation for :ocv:class:`gpu::LookUpTable` .



-gpu::copyMakeBorder
-----------------------
+cuda::copyMakeBorder
+--------------------
 Forms a border around an image.

-.. ocv:function:: void gpu::copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType, Scalar value = Scalar(), Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType, Scalar value = Scalar(), Stream& stream = Stream::Null())

    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and ``CV_32FC1`` types are supported.

--- a/modules/cudaarithm/doc/cudaarithm.rst
+++ b/modules/cudaarithm/doc/cudaarithm.rst
@ -0,0 +1,11 @@
+***************************************************
+cudaarithm. CUDA-accelerated Operations on Matrices
+***************************************************
+
+.. toctree::
+    :maxdepth: 1
+
+    core
+    element_operations
+    reductions
+    arithm
--- a/modules/cudaarithm/doc/element_operations.rst
+++ b/modules/cudaarithm/doc/element_operations.rst
@ -5,11 +5,11 @@ Per-element Operations



-gpu::add
--------
+cuda::add
+---------
 Computes a matrix-matrix or matrix-scalar sum.

-.. ocv:function:: void gpu::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -27,11 +27,11 @@ Computes a matrix-matrix or matrix-scalar sum.



-gpu::subtract
-------------
+cuda::subtract
+--------------
 Computes a matrix-matrix or matrix-scalar difference.

-.. ocv:function:: void gpu::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -49,11 +49,11 @@ Computes a matrix-matrix or matrix-scalar difference.



-gpu::multiply
-------------
+cuda::multiply
+--------------
 Computes a matrix-matrix or matrix-scalar per-element product.

-.. ocv:function:: void gpu::multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -71,13 +71,13 @@ Computes a matrix-matrix or matrix-scalar per-element product.



-gpu::divide
-----------
+cuda::divide
+------------
 Computes a matrix-matrix or matrix-scalar division.

-.. ocv:function:: void gpu::divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())

    :param src1: First source matrix or a scalar.

@ -97,11 +97,11 @@ This function, in contrast to :ocv:func:`divide`, uses a round-down rounding mod



-gpu::absdiff
------------
+cuda::absdiff
+-------------
 Computes per-element absolute difference of two matrices (or of a matrix and scalar).

-.. ocv:function:: void gpu::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -115,11 +115,11 @@ Computes per-element absolute difference of two matrices (or of a matrix and sca



-gpu::abs
--------
+cuda::abs
+---------
 Computes an absolute value of each matrix element.

-.. ocv:function:: void gpu::abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -131,11 +131,11 @@ Computes an absolute value of each matrix element.



-gpu::sqr
--------
+cuda::sqr
+---------
 Computes a square value of each matrix element.

-.. ocv:function:: void gpu::sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -145,11 +145,11 @@ Computes a square value of each matrix element.



-gpu::sqrt
---------
+cuda::sqrt
+----------
 Computes a square root of each matrix element.

-.. ocv:function:: void gpu::sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -161,11 +161,11 @@ Computes a square root of each matrix element.



-gpu::exp
--------
+cuda::exp
+---------
 Computes an exponent of each matrix element.

-.. ocv:function:: void gpu::exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -177,11 +177,11 @@ Computes an exponent of each matrix element.



-gpu::log
--------
+cuda::log
+---------
 Computes a natural logarithm of absolute value of each matrix element.

-.. ocv:function:: void gpu::log(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::log(InputArray src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -193,11 +193,11 @@ Computes a natural logarithm of absolute value of each matrix element.



-gpu::pow
--------
+cuda::pow
+---------
 Raises every matrix element to a power.

-.. ocv:function:: void gpu::pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -217,11 +217,11 @@ The function ``pow`` raises every element of the input matrix to ``power`` :



-gpu::compare
------------
+cuda::compare
+-------------
 Compares elements of two matrices (or of a matrix and scalar).

-.. ocv:function:: void gpu::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -244,11 +244,11 @@ Compares elements of two matrices (or of a matrix and scalar).



-gpu::bitwise_not
----------------
+cuda::bitwise_not
+-----------------
 Performs a per-element bitwise inversion.

-.. ocv:function:: void gpu::bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -260,11 +260,11 @@ Performs a per-element bitwise inversion.



-gpu::bitwise_or
---------------
+cuda::bitwise_or
+----------------
 Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).

-.. ocv:function:: void gpu::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -278,11 +278,11 @@ Performs a per-element bitwise disjunction of two matrices (or of matrix and sca



-gpu::bitwise_and
----------------
+cuda::bitwise_and
+-----------------
 Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).

-.. ocv:function:: void gpu::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -296,11 +296,11 @@ Performs a per-element bitwise conjunction of two matrices (or of matrix and sca



-gpu::bitwise_xor
----------------
+cuda::bitwise_xor
+-----------------
 Performs a per-element bitwise ``exclusive or`` operation of two matrices (or of matrix and scalar).

-.. ocv:function:: void gpu::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -314,11 +314,11 @@ Performs a per-element bitwise ``exclusive or`` operation of two matrices (or of



-gpu::rshift
-----------
+cuda::rshift
+------------
 Performs pixel by pixel right shift of an image by a constant value.

-.. ocv:function:: void gpu::rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix. Supports 1, 3 and 4 channels images with integers elements.

@ -330,11 +330,11 @@ Performs pixel by pixel right shift of an image by a constant value.



-gpu::lshift
-----------
+cuda::lshift
+------------
 Performs pixel by pixel left shift of an image by a constant value.

-.. ocv:function:: void gpu::lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.

@ -346,11 +346,11 @@ Performs pixel by pixel right left of an image by a constant value.



-gpu::min
--------
+cuda::min
+---------
 Computes the per-element minimum of two matrices (or a matrix and a scalar).

-.. ocv:function:: void gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -364,11 +364,11 @@ Computes the per-element minimum of two matrices (or a matrix and a scalar).



-gpu::max
--------
+cuda::max
+---------
 Computes the per-element maximum of two matrices (or a matrix and a scalar).

-.. ocv:function:: void gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())

    :param src1: First source matrix or scalar.

@ -382,11 +382,11 @@ Computes the per-element maximum of two matrices (or a matrix and a scalar).



-gpu::addWeighted
----------------
+cuda::addWeighted
+-----------------
 Computes the weighted sum of two arrays.

-.. ocv:function:: void gpu::addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())

    :param src1: First source array.

@ -416,11 +416,11 @@ where ``I`` is a multi-dimensional index of array elements. In case of multi-cha



-gpu::threshold
--------------
+cuda::threshold
+---------------
 Applies a fixed-level threshold to each array element.

-.. ocv:function:: double gpu::threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
+.. ocv:function:: double cuda::threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())

    :param src: Source array (single-channel).

@ -438,13 +438,13 @@ Applies a fixed-level threshold to each array element.



-gpu::magnitude
--------------
+cuda::magnitude
+---------------
 Computes magnitudes of complex matrix elements.

-.. ocv:function:: void gpu::magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())

    :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).

@ -460,13 +460,13 @@ Computes magnitudes of complex matrix elements.



-gpu::magnitudeSqr
-----------------
+cuda::magnitudeSqr
+------------------
 Computes squared magnitudes of complex matrix elements.

-.. ocv:function:: void gpu::magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream=Stream::Null() )
+.. ocv:function:: void cuda::magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream=Stream::Null() )

-.. ocv:function:: void gpu::magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())

    :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).

@ -480,11 +480,11 @@ Computes squared magnitudes of complex matrix elements.



-gpu::phase
----------
+cuda::phase
+-----------
 Computes polar angles of complex matrix elements.

-.. ocv:function:: void gpu::phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())

    :param x: Source matrix containing real components ( ``CV_32FC1`` ).

@ -500,11 +500,11 @@ Computes polar angles of complex matrix elements.



-gpu::cartToPolar
----------------
+cuda::cartToPolar
+-----------------
 Converts Cartesian coordinates into polar.

-.. ocv:function:: void gpu::cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())

    :param x: Source matrix containing real components ( ``CV_32FC1`` ).

@ -522,11 +522,11 @@ Converts Cartesian coordinates into polar.



-gpu::polarToCart
----------------
+cuda::polarToCart
+-----------------
 Converts polar coordinates into Cartesian.

-.. ocv:function:: void gpu::polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null())

    :param magnitude: Source matrix containing magnitudes ( ``CV_32FC1`` ).

--- a/modules/cudaarithm/doc/reductions.rst
+++ b/modules/cudaarithm/doc/reductions.rst
@ -5,17 +5,17 @@ Matrix Reductions



-gpu::norm
---------
+cuda::norm
+----------
 Returns the norm of a matrix (or difference of two matrices).

-.. ocv:function:: double gpu::norm(InputArray src1, int normType)
+.. ocv:function:: double cuda::norm(InputArray src1, int normType)

-.. ocv:function:: double gpu::norm(InputArray src1, int normType, GpuMat& buf)
+.. ocv:function:: double cuda::norm(InputArray src1, int normType, GpuMat& buf)

-.. ocv:function:: double gpu::norm(InputArray src1, int normType, InputArray mask, GpuMat& buf)
+.. ocv:function:: double cuda::norm(InputArray src1, int normType, InputArray mask, GpuMat& buf)

-.. ocv:function:: double gpu::norm(InputArray src1, InputArray src2, int normType=NORM_L2)
+.. ocv:function:: double cuda::norm(InputArray src1, InputArray src2, int normType=NORM_L2)

    :param src1: Source matrix. Any matrices except 64F are supported.

@ -31,15 +31,15 @@ Returns the norm of a matrix (or difference of two matrices).



-gpu::sum
--------
+cuda::sum
+---------
 Returns the sum of matrix elements.

-.. ocv:function:: Scalar gpu::sum(InputArray src)
+.. ocv:function:: Scalar cuda::sum(InputArray src)

-.. ocv:function:: Scalar gpu::sum(InputArray src, GpuMat& buf)
+.. ocv:function:: Scalar cuda::sum(InputArray src, GpuMat& buf)

-.. ocv:function:: Scalar gpu::sum(InputArray src, InputArray mask, GpuMat& buf)
+.. ocv:function:: Scalar cuda::sum(InputArray src, InputArray mask, GpuMat& buf)

    :param src: Source image of any depth except for ``CV_64F`` .

@ -51,15 +51,15 @@ Returns the sum of matrix elements.



-gpu::absSum
-----------
+cuda::absSum
+------------
 Returns the sum of absolute values for matrix elements.

-.. ocv:function:: Scalar gpu::absSum(InputArray src)
+.. ocv:function:: Scalar cuda::absSum(InputArray src)

-.. ocv:function:: Scalar gpu::absSum(InputArray src, GpuMat& buf)
+.. ocv:function:: Scalar cuda::absSum(InputArray src, GpuMat& buf)

-.. ocv:function:: Scalar gpu::absSum(InputArray src, InputArray mask, GpuMat& buf)
+.. ocv:function:: Scalar cuda::absSum(InputArray src, InputArray mask, GpuMat& buf)

    :param src: Source image of any depth except for ``CV_64F`` .

@ -69,15 +69,15 @@ Returns the sum of absolute values for matrix elements.



-gpu::sqrSum
-----------
+cuda::sqrSum
+------------
 Returns the squared sum of matrix elements.

-.. ocv:function:: Scalar gpu::sqrSum(InputArray src)
+.. ocv:function:: Scalar cuda::sqrSum(InputArray src)

-.. ocv:function:: Scalar gpu::sqrSum(InputArray src, GpuMat& buf)
+.. ocv:function:: Scalar cuda::sqrSum(InputArray src, GpuMat& buf)

-.. ocv:function:: Scalar gpu::sqrSum(InputArray src, InputArray mask, GpuMat& buf)
+.. ocv:function:: Scalar cuda::sqrSum(InputArray src, InputArray mask, GpuMat& buf)

    :param src: Source image of any depth except for ``CV_64F`` .

@ -87,13 +87,13 @@ Returns the squared sum of matrix elements.



-gpu::minMax
-----------
+cuda::minMax
+------------
 Finds global minimum and maximum matrix elements and returns their values.

-.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
+.. ocv:function:: void cuda::minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())

-.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf)
+.. ocv:function:: void cuda::minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf)

    :param src: Single-channel source image.

@ -111,13 +111,13 @@ The function does not work with ``CV_64F`` images on GPUs with the compute capab



-gpu::minMaxLoc
--------------
+cuda::minMaxLoc
+---------------
 Finds global minimum and maximum matrix elements and returns their values with locations.

-.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, InputArray mask=noArray())
+.. ocv:function:: void cuda::minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, InputArray mask=noArray())

-.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray mask, GpuMat& valbuf, GpuMat& locbuf)
+.. ocv:function:: void cuda::minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray mask, GpuMat& valbuf, GpuMat& locbuf)

    :param src: Single-channel source image.

@ -141,13 +141,13 @@ Finds global minimum and maximum matrix elements and returns their values with l



-gpu::countNonZero
-----------------
+cuda::countNonZero
+------------------
 Counts non-zero matrix elements.

-.. ocv:function:: int gpu::countNonZero(InputArray src)
+.. ocv:function:: int cuda::countNonZero(InputArray src)

-.. ocv:function:: int gpu::countNonZero(InputArray src, GpuMat& buf)
+.. ocv:function:: int cuda::countNonZero(InputArray src, GpuMat& buf)

    :param src: Single-channel source image.

@ -159,11 +159,11 @@ The function does not work with ``CV_64F`` images on GPUs with the compute capab



-gpu::reduce
-----------
+cuda::reduce
+------------
 Reduces a matrix to a vector.

-.. ocv:function:: void gpu::reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())

    :param mtx: Source 2D matrix.

@ -191,12 +191,12 @@ The function ``reduce`` reduces the matrix to a vector by treating the matrix ro



-gpu::meanStdDev
---------------
+cuda::meanStdDev
+----------------
 Computes a mean value and a standard deviation of matrix elements.

-.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev)
-.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)
+.. ocv:function:: void cuda::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev)
+.. ocv:function:: void cuda::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)

    :param mtx: Source matrix.  ``CV_8UC1``  matrices are supported for now.

@ -210,11 +210,11 @@ Computes a mean value and a standard deviation of matrix elements.



-gpu::rectStdDev
---------------
+cuda::rectStdDev
+----------------
 Computes a standard deviation of integral images.

-.. ocv:function:: void gpu::rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null())

    :param src: Source image. Only the ``CV_32SC1`` type is supported.

@ -228,13 +228,13 @@ Computes a standard deviation of integral images.



-gpu::normalize
--------------
+cuda::normalize
+---------------
 Normalizes the norm or value range of an array.

-.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
+.. ocv:function:: void cuda::normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())

-.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha, double beta, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
+.. ocv:function:: void cuda::normalize(InputArray src, OutputArray dst, double alpha, double beta, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)

    :param src: Input array.

@ -258,13 +258,13 @@ Normalizes the norm or value range of an array.



-gpu::integral
-------------
+cuda::integral
+--------------
 Computes an integral image.

-.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())

    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.

@ -278,13 +278,13 @@ Computes an integral image.



-gpu::sqrIntegral
----------------
+cuda::sqrIntegral
+-----------------
 Computes a squared integral image.

-.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null())
+.. ocv:function:: void cuda::sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null())

    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.

--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@ -40,16 +40,16 @@
 //
 //M*/

-#ifndef __OPENCV_GPUARITHM_HPP__
-#define __OPENCV_GPUARITHM_HPP__
+#ifndef __OPENCV_CUDAARITHM_HPP__
+#define __OPENCV_CUDAARITHM_HPP__

 #ifndef __cplusplus
-#  error gpuarithm.hpp header must be compiled as C++
+#  error cudaarithm.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpu.hpp"
+#include "opencv2/core/cuda.hpp"

-namespace cv { namespace gpu {
+namespace cv { namespace cuda {

 //! adds one matrix to another (dst = src1 + src2)
 CV_EXPORTS void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());
@ -369,6 +369,6 @@ public:

 CV_EXPORTS Ptr<Convolution> createConvolution(Size user_block_size = Size());

-}} // namespace cv { namespace gpu {
+}} // namespace cv { namespace cuda {

-#endif /* __OPENCV_GPUARITHM_HPP__ */
+#endif /* __OPENCV_CUDAARITHM_HPP__ */
--- a/modules/cudaarithm/perf/perf_arithm.cpp
+++ b/modules/cudaarithm/perf/perf_arithm.cpp
@ -73,18 +73,18 @@ PERF_TEST_P(Sz_Type_Flags, GEMM,
    cv::Mat src3(size, type);
    declare.in(src3, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
        declare.time(5.0);

-        const cv::gpu::GpuMat d_src1(src1);
-        const cv::gpu::GpuMat d_src2(src2);
-        const cv::gpu::GpuMat d_src3(src3);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src1(src1);
+        const cv::cuda::GpuMat d_src2(src2);
+        const cv::cuda::GpuMat d_src3(src3);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, dst, flags);
+        TEST_CYCLE() cv::cuda::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, dst, flags);

-        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+        CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
    }
    else
    {
@ -106,7 +106,7 @@ CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMP
 DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags);

 PERF_TEST_P(Sz_Flags, MulSpectrums,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(0, DftFlags(cv::DFT_ROWS))))
 {
    const cv::Size size = GET_PARAM(0);
@ -116,15 +116,15 @@ PERF_TEST_P(Sz_Flags, MulSpectrums,
    cv::Mat b(size, CV_32FC2);
    declare.in(a, b, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_a(a);
-        const cv::gpu::GpuMat d_b(b);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_a(a);
+        const cv::cuda::GpuMat d_b(b);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag);
+        TEST_CYCLE() cv::cuda::mulSpectrums(d_a, d_b, dst, flag);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -140,7 +140,7 @@ PERF_TEST_P(Sz_Flags, MulSpectrums,
 // MulAndScaleSpectrums

 PERF_TEST_P(Sz, MulAndScaleSpectrums,
-            GPU_TYPICAL_MAT_SIZES)
+            CUDA_TYPICAL_MAT_SIZES)
 {
    const cv::Size size = GetParam();

@ -150,15 +150,15 @@ PERF_TEST_P(Sz, MulAndScaleSpectrums,
    cv::Mat src2(size, CV_32FC2);
    declare.in(src1,src2, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src1(src1);
-        const cv::gpu::GpuMat d_src2(src2);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src1(src1);
+        const cv::cuda::GpuMat d_src2(src2);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false);
+        TEST_CYCLE() cv::cuda::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -170,7 +170,7 @@ PERF_TEST_P(Sz, MulAndScaleSpectrums,
 // Dft

 PERF_TEST_P(Sz_Flags, Dft,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE))))
 {
    declare.time(10.0);
@ -181,14 +181,14 @@ PERF_TEST_P(Sz_Flags, Dft,
    cv::Mat src(size, CV_32FC2);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::dft(d_src, dst, size, flag);
+        TEST_CYCLE() cv::cuda::dft(d_src, dst, size, flag);

-        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+        CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
    }
    else
    {
@ -206,7 +206,7 @@ PERF_TEST_P(Sz_Flags, Dft,
 DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool);

 PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(17, 27, 32, 64),
                    Bool()))
 {
@ -220,21 +220,21 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
    const cv::Mat templ(templ_size, templ_size, CV_32FC1);
    declare.in(image, templ, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::gpu::GpuMat d_image = cv::gpu::createContinuous(size, CV_32FC1);
+        cv::cuda::GpuMat d_image = cv::cuda::createContinuous(size, CV_32FC1);
        d_image.upload(image);

-        cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1);
+        cv::cuda::GpuMat d_templ = cv::cuda::createContinuous(templ_size, templ_size, CV_32FC1);
        d_templ.upload(templ);

-        cv::Ptr<cv::gpu::Convolution> convolution = cv::gpu::createConvolution();
+        cv::Ptr<cv::cuda::Convolution> convolution = cv::cuda::createConvolution();

-        cv::gpu::GpuMat dst;
+        cv::cuda::GpuMat dst;

        TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr);

-        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+        CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
    }
    else
    {
@ -253,22 +253,22 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
 // Integral

 PERF_TEST_P(Sz, Integral,
-            GPU_TYPICAL_MAT_SIZES)
+            CUDA_TYPICAL_MAT_SIZES)
 {
    const cv::Size size = GetParam();

    cv::Mat src(size, CV_8UC1);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;
+        cv::cuda::GpuMat d_buf;

-        TEST_CYCLE() cv::gpu::integral(d_src, dst, d_buf);
+        TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -284,21 +284,21 @@ PERF_TEST_P(Sz, Integral,
 // IntegralSqr

 PERF_TEST_P(Sz, IntegralSqr,
-            GPU_TYPICAL_MAT_SIZES)
+            CUDA_TYPICAL_MAT_SIZES)
 {
    const cv::Size size = GetParam();

    cv::Mat src(size, CV_8UC1);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst, buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst, buf;

-        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst, buf);
+        TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
--- a/modules/cudaarithm/perf/perf_core.cpp
+++ b/modules/cudaarithm/perf/perf_core.cpp
@ -52,7 +52,7 @@ using namespace perf;
 // Merge

 PERF_TEST_P(Sz_Depth_Cn, Merge,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    ARITHM_MAT_DEPTH,
                    Values(2, 3, 4)))
 {
@ -67,17 +67,17 @@ PERF_TEST_P(Sz_Depth_Cn, Merge,
        declare.in(src[i], WARMUP_RNG);
    }

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        std::vector<cv::gpu::GpuMat> d_src(channels);
+        std::vector<cv::cuda::GpuMat> d_src(channels);
        for (int i = 0; i < channels; ++i)
            d_src[i].upload(src[i]);

-        cv::gpu::GpuMat dst;
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::merge(d_src, dst);
+        TEST_CYCLE() cv::cuda::merge(d_src, dst);

-        GPU_SANITY_CHECK(dst, 1e-10);
+        CUDA_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
@ -93,7 +93,7 @@ PERF_TEST_P(Sz_Depth_Cn, Merge,
 // Split

 PERF_TEST_P(Sz_Depth_Cn, Split,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    ARITHM_MAT_DEPTH,
                    Values(2, 3, 4)))
 {
@ -104,18 +104,18 @@ PERF_TEST_P(Sz_Depth_Cn, Split,
    cv::Mat src(size, CV_MAKE_TYPE(depth, channels));
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        std::vector<cv::gpu::GpuMat> dst;
+        const cv::cuda::GpuMat d_src(src);
+        std::vector<cv::cuda::GpuMat> dst;

-        TEST_CYCLE() cv::gpu::split(d_src, dst);
+        TEST_CYCLE() cv::cuda::split(d_src, dst);

-        const cv::gpu::GpuMat& dst0 = dst[0];
-        const cv::gpu::GpuMat& dst1 = dst[1];
+        const cv::cuda::GpuMat& dst0 = dst[0];
+        const cv::cuda::GpuMat& dst1 = dst[1];

-        GPU_SANITY_CHECK(dst0, 1e-10);
-        GPU_SANITY_CHECK(dst1, 1e-10);
+        CUDA_SANITY_CHECK(dst0, 1e-10);
+        CUDA_SANITY_CHECK(dst1, 1e-10);
    }
    else
    {
@ -135,7 +135,7 @@ PERF_TEST_P(Sz_Depth_Cn, Split,
 // Transpose

 PERF_TEST_P(Sz_Type, Transpose,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32SC2, CV_64FC1)))
 {
    const cv::Size size = GET_PARAM(0);
@ -144,14 +144,14 @@ PERF_TEST_P(Sz_Type, Transpose,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::transpose(d_src, dst);
+        TEST_CYCLE() cv::cuda::transpose(d_src, dst);

-        GPU_SANITY_CHECK(dst, 1e-10);
+        CUDA_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
@ -172,9 +172,9 @@ CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
 DEF_PARAM_TEST(Sz_Depth_Cn_Code, cv::Size, MatDepth, MatCn, FlipCode);

 PERF_TEST_P(Sz_Depth_Cn_Code, Flip,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
+                    CUDA_CHANNELS_1_3_4,
                    FlipCode::all()))
 {
    const cv::Size size = GET_PARAM(0);
@ -187,14 +187,14 @@ PERF_TEST_P(Sz_Depth_Cn_Code, Flip,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::flip(d_src, dst, flipCode);
+        TEST_CYCLE() cv::cuda::flip(d_src, dst, flipCode);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -210,7 +210,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code, Flip,
 // LutOneChannel

 PERF_TEST_P(Sz_Type, LutOneChannel,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8UC1, CV_8UC3)))
 {
    const cv::Size size = GET_PARAM(0);
@ -222,16 +222,16 @@ PERF_TEST_P(Sz_Type, LutOneChannel,
    cv::Mat lut(1, 256, CV_8UC1);
    declare.in(lut, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+        cv::Ptr<cv::cuda::LookUpTable> lutAlg = cv::cuda::createLookUpTable(lut);

-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

        TEST_CYCLE() lutAlg->transform(d_src, dst);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -247,7 +247,7 @@ PERF_TEST_P(Sz_Type, LutOneChannel,
 // LutMultiChannel

 PERF_TEST_P(Sz_Type, LutMultiChannel,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values<MatType>(CV_8UC3)))
 {
    const cv::Size size = GET_PARAM(0);
@ -259,16 +259,16 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,
    cv::Mat lut(1, 256, CV_MAKE_TYPE(CV_8U, src.channels()));
    declare.in(lut, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+        cv::Ptr<cv::cuda::LookUpTable> lutAlg = cv::cuda::createLookUpTable(lut);

-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

        TEST_CYCLE() lutAlg->transform(d_src, dst);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -286,9 +286,9 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,
 DEF_PARAM_TEST(Sz_Depth_Cn_Border, cv::Size, MatDepth, MatCn, BorderMode);

 PERF_TEST_P(Sz_Depth_Cn_Border, CopyMakeBorder,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
+                    CUDA_CHANNELS_1_3_4,
                    ALL_BORDER_MODES))
 {
    const cv::Size size = GET_PARAM(0);
@ -301,14 +301,14 @@ PERF_TEST_P(Sz_Depth_Cn_Border, CopyMakeBorder,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::copyMakeBorder(d_src, dst, 5, 5, 5, 5, borderMode);
+        TEST_CYCLE() cv::cuda::copyMakeBorder(d_src, dst, 5, 5, 5, 5, borderMode);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
--- a/modules/cudaarithm/perf/perf_element_operations.cpp
+++ b/modules/cudaarithm/perf/perf_element_operations.cpp
--- a/modules/cudaarithm/perf/perf_main.cpp
+++ b/modules/cudaarithm/perf/perf_main.cpp
@ -44,4 +44,4 @@

 using namespace perf;

-CV_PERF_TEST_CUDA_MAIN(gpuarithm)
+CV_PERF_TEST_CUDA_MAIN(cudaarithm)
--- a/modules/cudaarithm/perf/perf_precomp.hpp
+++ b/modules/cudaarithm/perf/perf_precomp.hpp
@ -52,9 +52,9 @@
 #define __OPENCV_PERF_PRECOMP_HPP__

 #include "opencv2/ts.hpp"
-#include "opencv2/ts/gpu_perf.hpp"
+#include "opencv2/ts/cuda_perf.hpp"

-#include "opencv2/gpuarithm.hpp"
+#include "opencv2/cudaarithm.hpp"
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"

--- a/modules/cudaarithm/perf/perf_reductions.cpp
+++ b/modules/cudaarithm/perf/perf_reductions.cpp
@ -52,7 +52,7 @@ using namespace perf;
 DEF_PARAM_TEST(Sz_Depth_Norm, cv::Size, MatDepth, NormType);

 PERF_TEST_P(Sz_Depth_Norm, Norm,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32S, CV_32F),
                    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
 {
@ -66,13 +66,13 @@ PERF_TEST_P(Sz_Depth_Norm, Norm,
    else
        declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_buf;
        double gpu_dst;

-        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src, normType, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src, normType, d_buf);

        SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
    }
@ -92,7 +92,7 @@ PERF_TEST_P(Sz_Depth_Norm, Norm,
 DEF_PARAM_TEST(Sz_Norm, cv::Size, NormType);

 PERF_TEST_P(Sz_Norm, NormDiff,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
 {
    const cv::Size size = GET_PARAM(0);
@ -104,14 +104,14 @@ PERF_TEST_P(Sz_Norm, NormDiff,
    cv::Mat src2(size, CV_8UC1);
    declare.in(src2, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src1(src1);
-        const cv::gpu::GpuMat d_src2(src2);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src1(src1);
+        const cv::cuda::GpuMat d_src2(src2);
+        cv::cuda::GpuMat d_buf;
        double gpu_dst;

-        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, d_buf, normType);
+        TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, d_buf, normType);

        SANITY_CHECK(gpu_dst);

@ -130,9 +130,9 @@ PERF_TEST_P(Sz_Norm, NormDiff,
 // Sum

 PERF_TEST_P(Sz_Depth_Cn, Sum,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+                    CUDA_CHANNELS_1_3_4))
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -143,13 +143,13 @@ PERF_TEST_P(Sz_Depth_Cn, Sum,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_buf;
        cv::Scalar gpu_dst;

-        TEST_CYCLE() gpu_dst = cv::gpu::sum(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src, d_buf);

        SANITY_CHECK(gpu_dst, 1e-5, ERROR_RELATIVE);
    }
@ -167,9 +167,9 @@ PERF_TEST_P(Sz_Depth_Cn, Sum,
 // SumAbs

 PERF_TEST_P(Sz_Depth_Cn, SumAbs,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+                    CUDA_CHANNELS_1_3_4))
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -180,13 +180,13 @@ PERF_TEST_P(Sz_Depth_Cn, SumAbs,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_buf;
        cv::Scalar gpu_dst;

-        TEST_CYCLE() gpu_dst = cv::gpu::absSum(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src, d_buf);

        SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
    }
@ -200,9 +200,9 @@ PERF_TEST_P(Sz_Depth_Cn, SumAbs,
 // SumSqr

 PERF_TEST_P(Sz_Depth_Cn, SumSqr,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values<MatDepth>(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+                    CUDA_CHANNELS_1_3_4))
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -213,13 +213,13 @@ PERF_TEST_P(Sz_Depth_Cn, SumSqr,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_buf;
        cv::Scalar gpu_dst;

-        TEST_CYCLE() gpu_dst = cv::gpu::sqrSum(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src, d_buf);

        SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
    }
@ -233,7 +233,7 @@ PERF_TEST_P(Sz_Depth_Cn, SumSqr,
 // MinMax

 PERF_TEST_P(Sz_Depth, MinMax,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
 {
    const cv::Size size = GET_PARAM(0);
@ -245,13 +245,13 @@ PERF_TEST_P(Sz_Depth, MinMax,
    else
        declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_buf;
        double gpu_minVal, gpu_maxVal;

-        TEST_CYCLE() cv::gpu::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::gpu::GpuMat(), d_buf);
+        TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat(), d_buf);

        SANITY_CHECK(gpu_minVal, 1e-10);
        SANITY_CHECK(gpu_maxVal, 1e-10);
@ -271,7 +271,7 @@ PERF_TEST_P(Sz_Depth, MinMax,
 // MinMaxLoc

 PERF_TEST_P(Sz_Depth, MinMaxLoc,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
 {
    const cv::Size size = GET_PARAM(0);
@ -283,14 +283,14 @@ PERF_TEST_P(Sz_Depth, MinMaxLoc,
    else
        declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_valbuf, d_locbuf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_valbuf, d_locbuf;
        double gpu_minVal, gpu_maxVal;
        cv::Point gpu_minLoc, gpu_maxLoc;

-        TEST_CYCLE() cv::gpu::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc, cv::gpu::GpuMat(), d_valbuf, d_locbuf);
+        TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc, cv::cuda::GpuMat(), d_valbuf, d_locbuf);

        SANITY_CHECK(gpu_minVal, 1e-10);
        SANITY_CHECK(gpu_maxVal, 1e-10);
@ -311,7 +311,7 @@ PERF_TEST_P(Sz_Depth, MinMaxLoc,
 // CountNonZero

 PERF_TEST_P(Sz_Depth, CountNonZero,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
 {
    const cv::Size size = GET_PARAM(0);
@ -320,13 +320,13 @@ PERF_TEST_P(Sz_Depth, CountNonZero,
    cv::Mat src(size, depth);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_buf;
        int gpu_dst = 0;

-        TEST_CYCLE() gpu_dst = cv::gpu::countNonZero(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src, d_buf);

        SANITY_CHECK(gpu_dst);
    }
@ -351,7 +351,7 @@ CV_ENUM(ReduceDim, Rows, Cols)
 DEF_PARAM_TEST(Sz_Depth_Cn_Code_Dim, cv::Size, MatDepth, MatCn, ReduceCode, ReduceDim);

 PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_16S, CV_32F),
                    Values(1, 2, 3, 4),
                    ReduceCode::all(),
@ -368,14 +368,14 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;

-        TEST_CYCLE() cv::gpu::reduce(d_src, dst, dim, reduceOp);
+        TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);

-        GPU_SANITY_CHECK(dst);
+        CUDA_SANITY_CHECK(dst);
    }
    else
    {
@ -393,7 +393,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
 DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType);

 PERF_TEST_P(Sz_Depth_NormType, Normalize,
-            Combine(GPU_TYPICAL_MAT_SIZES,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
                    Values(NormType(cv::NORM_INF),
                           NormType(cv::NORM_L1),
@ -410,15 +410,15 @@ PERF_TEST_P(Sz_Depth_NormType, Normalize,
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);

-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_norm_buf, d_cvt_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;
+        cv::cuda::GpuMat d_norm_buf, d_cvt_buf;

-        TEST_CYCLE() cv::gpu::normalize(d_src, dst, alpha, beta, norm_type, type, cv::gpu::GpuMat(), d_norm_buf, d_cvt_buf);
+        TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat(), d_norm_buf, d_cvt_buf);

-        GPU_SANITY_CHECK(dst, 1e-6);
+        CUDA_SANITY_CHECK(dst, 1e-6);
    }
    else
    {
@ -434,7 +434,7 @@ PERF_TEST_P(Sz_Depth_NormType, Normalize,
 // MeanStdDev

 PERF_TEST_P(Sz, MeanStdDev,
-            GPU_TYPICAL_MAT_SIZES)
+            CUDA_TYPICAL_MAT_SIZES)
 {
    const cv::Size size = GetParam();

@ -442,14 +442,14 @@ PERF_TEST_P(Sz, MeanStdDev,
    declare.in(src, WARMUP_RNG);


-    if (PERF_RUN_GPU())
+    if (PERF_RUN_CUDA())
    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_buf;
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat d_buf;
        cv::Scalar gpu_mean;
        cv::Scalar gpu_stddev;

-        TEST_CYCLE() cv::gpu::meanStdDev(d_src, gpu_mean, gpu_stddev, d_buf);
+        TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev, d_buf);

        SANITY_CHECK(gpu_mean);
        SANITY_CHECK(gpu_stddev);
--- a/modules/cudaarithm/src/arithm.cpp
+++ b/modules/cudaarithm/src/arithm.cpp
@ -43,18 +43,18 @@
 #include "precomp.hpp"

 using namespace cv;
-using namespace cv::gpu;
+using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-void cv::gpu::gemm(InputArray, InputArray, double, InputArray, double, OutputArray, int, Stream&) { throw_no_cuda(); }
+void cv::cuda::gemm(InputArray, InputArray, double, InputArray, double, OutputArray, int, Stream&) { throw_no_cuda(); }

-void cv::gpu::mulSpectrums(InputArray, InputArray, OutputArray, int, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, float, bool, Stream&) { throw_no_cuda(); }
+void cv::cuda::mulSpectrums(InputArray, InputArray, OutputArray, int, bool, Stream&) { throw_no_cuda(); }
+void cv::cuda::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, float, bool, Stream&) { throw_no_cuda(); }

-void cv::gpu::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); }
+void cv::cuda::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); }

-Ptr<Convolution> cv::gpu::createConvolution(Size) { throw_no_cuda(); return Ptr<Convolution>(); }
+Ptr<Convolution> cv::cuda::createConvolution(Size) { throw_no_cuda(); return Ptr<Convolution>(); }

 #else /* !defined (HAVE_CUDA) */

@ -162,7 +162,7 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // gemm

-void cv::gpu::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray _src3, double beta, OutputArray _dst, int flags, Stream& stream)
+void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray _src3, double beta, OutputArray _dst, int flags, Stream& stream)
 {
 #ifndef HAVE_CUBLAS
    (void) _src1;
@ -221,7 +221,7 @@ void cv::gpu::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
        {
            if (tr3)
            {
-                gpu::transpose(src3, dst, stream);
+                cuda::transpose(src3, dst, stream);
            }
            else
            {
@ -297,7 +297,7 @@ void cv::gpu::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray

 #ifdef HAVE_CUFFT

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);

@ -306,7 +306,7 @@ namespace cv { namespace gpu { namespace cudev

 #endif

-void cv::gpu::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
+void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
 {
 #ifndef HAVE_CUFFT
    (void) _src1;
@ -320,7 +320,7 @@ void cv::gpu::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst,
    (void) flags;

    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
-    static Caller callers[] = { cudev::mulSpectrums, cudev::mulSpectrums_CONJ };
+    static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };

    GpuMat src1 = _src1.getGpuMat();
    GpuMat src2 = _src2.getGpuMat();
@ -341,7 +341,7 @@ void cv::gpu::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst,

 #ifdef HAVE_CUFFT

-namespace cv { namespace gpu { namespace cudev
+namespace cv { namespace cuda { namespace device
 {
    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);

@ -350,7 +350,7 @@ namespace cv { namespace gpu { namespace cudev

 #endif

-void cv::gpu::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
+void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
 {
 #ifndef HAVE_CUFFT
    (void) _src1;
@ -365,7 +365,7 @@ void cv::gpu::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArr
    (void)flags;

    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
-    static Caller callers[] = { cudev::mulAndScaleSpectrums, cudev::mulAndScaleSpectrums_CONJ };
+    static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };

    GpuMat src1 = _src1.getGpuMat();
    GpuMat src2 = _src2.getGpuMat();
@ -384,7 +384,7 @@ void cv::gpu::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArr
 //////////////////////////////////////////////////////////////////////////////
 // dft

-void cv::gpu::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
+void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
 {
 #ifndef HAVE_CUFFT
    (void) _src;
@ -478,7 +478,7 @@ void cv::gpu::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, S
    cufftSafeCall( cufftDestroy(plan) );

    if (is_scaled_dft)
-        gpu::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+        cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);

 #endif
 }
@ -580,7 +580,7 @@ namespace
        cufftSafeCall( cufftSetStream(planC2R, stream) );

        GpuMat templ_roi(templ.size(), CV_32FC1, templ.data, templ.step);
-        gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
+        cuda::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
                            templ_block.cols - templ_roi.cols, 0, Scalar(), _stream);

        cufftSafeCall( cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), templ_spect.ptr<cufftComplex>()) );
@ -594,12 +594,12 @@ namespace
                                    std::min(y + dft_size.height, image.rows) - y);
                GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
                                 image.step);
-                gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
+                cuda::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
                                    0, image_block.cols - image_roi.cols, 0, Scalar(), _stream);

                cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
                                           image_spect.ptr<cufftComplex>()));
-                gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
+                cuda::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
                                          1.f / dft_size.area(), ccorr, _stream);
                cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
                                           result_data.ptr<cufftReal>()));
@ -622,7 +622,7 @@ namespace

 #endif

-Ptr<Convolution> cv::gpu::createConvolution(Size user_block_size)
+Ptr<Convolution> cv::cuda::createConvolution(Size user_block_size)
 {
 #ifndef HAVE_CUFFT
    (void) user_block_size;
--- a/Show More
+++ b/Show More