diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f793f1070..2973f4ca00 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi
 # OpenCV build options
 # ===================================================
+OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enable dynamic CUDA linkage" ON IF ANDROID )
 OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) )
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") )
 OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX )
@@ -472,7 +473,11 @@ endif()
 # ----------------------------------------------------------------------------
 # Add CUDA libraries (needed for apps/tools, samples)
 # ----------------------------------------------------------------------------
-if(HAVE_CUDA)
+if(NOT HAVE_CUDA)
+  set(ENABLE_DYNAMIC_CUDA OFF)
+endif()
+
+if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
   set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
   if(HAVE_CUBLAS)
     set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY})
@@ -481,6 +486,7 @@ if(HAVE_CUDA)
     set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY})
   endif()
 endif()
+
 # ----------------------------------------------------------------------------
 # Solution folders:
 # ----------------------------------------------------------------------------
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index c923aba413..3dd749b053 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -488,7 +488,7 @@ macro(ocv_glob_module_sources)
   file(GLOB lib_cuda_srcs "src/cuda/*.cu")
   set(cuda_objs "")
   set(lib_cuda_hdrs "")
-  if(HAVE_CUDA AND lib_cuda_srcs)
+  if(HAVE_CUDA)
     ocv_include_directories(${CUDA_INCLUDE_DIRS})
     file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
@@ -537,9 +537,6 @@ macro(ocv_create_module)
     target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS})
     target_link_libraries(${the_module} LINK_INTERFACE_LIBRARIES ${OPENCV_MODULE_${the_module}_DEPS})
     target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
-    if (HAVE_CUDA)
-      target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
-    endif()
   endif()
   add_dependencies(opencv_modules ${the_module})
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 66b8ae0d2f..a1e71bf4f7 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,11 +1,25 @@
 set(the_description "The Core Functionality")
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
+  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+else()
+  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
+
+ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/dynamicuda/include/" ${ZLIB_INCLUDE_DIR})
 if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
+
+if(ENABLE_DYNAMIC_CUDA)
+  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+else()
+  if (HAVE_CUDA)
+    add_definitions(-DUSE_CUDA)
+  endif()
+endif()
+
 if(HAVE_CUDA)
   ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
@@ -14,11 +28,26 @@ endif()
 file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
+if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
+  file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
+  ocv_include_directories(${CUDA_INCLUDE_DIRS})
+  ocv_cuda_compile(cuda_objs ${lib_cuda})
+endif()
+
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
-ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
-                        HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
+  source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
+endif()
+
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+else()
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} ${cuda_objs}
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+endif()
 ocv_create_module()
 ocv_add_precompiled_headers(${the_module})
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 4c4af61c47..94bb548235 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA)
     #include <cuda_runtime.h>
    #include <npp.h>
@@ -60,493 +60,232 @@
 #endif
 #endif
+#ifdef DYNAMIC_CUDA_SUPPORT
+#include <dlfcn.h>
+#include
+#include
+#include
+#endif
+
+#ifdef ANDROID
+# include <android/log.h>
+
+# define LOG_TAG "OpenCV::CUDA"
+# define LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__))
+# define LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__))
+# define LOGI(...)
((void)__android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)) +#endif + using namespace std; using namespace cv; using namespace cv::gpu; -#ifndef HAVE_CUDA - #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") -#else // HAVE_CUDA - -namespace -{ -#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) -#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) - - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") - { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -} - -#endif // HAVE_CUDA - -//////////////////////////////// Initialization & Info //////////////////////// - -#ifndef HAVE_CUDA +#include "opencv2/dynamicuda/dynamicuda.hpp" -int cv::gpu::getCudaEnabledDeviceCount() { return 0; } +#ifdef DYNAMIC_CUDA_SUPPORT -void cv::gpu::setDevice(int) { throw_nogpu; } -int cv::gpu::getDevice() { throw_nogpu; return 0; } +typedef GpuFuncTable* (*GpuFactoryType)(); +typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)(); -void cv::gpu::resetDevice() { throw_nogpu; } +static GpuFactoryType gpuFactory = NULL; +static DeviceInfoFactoryType deviceInfoFactory = NULL; -bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; } - -bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; } - -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; } -void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; } -size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; } -size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; } -bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; } -bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; } -void cv::gpu::DeviceInfo::query() { throw_nogpu; } - -void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; } -void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; } - -#else // HAVE_CUDA - -int cv::gpu::getCudaEnabledDeviceCount() +# if defined(__linux__) || defined(__APPLE__) || defined (ANDROID) +# ifdef ANDROID +static const std::string getCudaSupportLibName() { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); - - if (error == cudaErrorInsufficientDriver) - return -1; - - if (error == cudaErrorNoDevice) - return 0; - - cudaSafeCall( error ); - return count; -} - -void cv::gpu::setDevice(int device) -{ - cudaSafeCall( cudaSetDevice( device ) ); -} - -int cv::gpu::getDevice() -{ - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; -} - -void cv::gpu::resetDevice() -{ - 
cudaSafeCall( cudaDeviceReset() ); -} - -namespace -{ - class CudaArch + Dl_info dl_info; + if(0 != dladdr((void *)getCudaSupportLibName, &dl_info)) { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; + LOGD("Library name: %s", dl_info.dli_fname); + LOGD("Library base address: %p", dl_info.dli_fbase); - private: - static void fromStr(const string& set_as_str, vector& arr); + const char* libName=dl_info.dli_fname; + while( ((*libName)=='/') || ((*libName)=='.') ) + libName++; - vector bin; - vector ptx; - vector features; - }; + char lineBuf[2048]; + FILE* file = fopen("/proc/self/smaps", "rt"); - const CudaArch cudaArch; - - CudaArch::CudaArch() - { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } - - bool CudaArch::builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } - - bool CudaArch::hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool CudaArch::hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } + if(file) + { + while (fgets(lineBuf, sizeof lineBuf, file) != NULL) + { + //verify that line ends with library name + int lineLength = strlen(lineBuf); + int libNameLength = strlen(libName); - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } + //trim end + for(int i = lineLength - 1; i >= 0 && isspace(lineBuf[i]); --i) + { + lineBuf[i] = 0; + --lineLength; + } - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } + if (0 != strncmp(lineBuf + lineLength - libNameLength, libName, libNameLength)) + { + //the line does not contain the library name + continue; + } - bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } + //extract path from smaps line + char* pathBegin = strchr(lineBuf, '/'); + if (0 == pathBegin) + { + LOGE("Strange error: could not find path beginning in lin \"%s\"", lineBuf); + continue; + } - void CudaArch::fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; + char* pathEnd = strrchr(pathBegin, '/'); + pathEnd[1] = 0; - istringstream stream(set_as_str); - int cur_value; + LOGD("Libraries folder found: %s", pathBegin); - while (!stream.eof()) + fclose(file); + return std::string(pathBegin) + "/libopencv_core_cuda.so"; + } + fclose(file); + LOGE("Could not find library path"); + } + else { - stream >> cur_value; - arr.push_back(cur_value); + LOGE("Could not read /proc/self/smaps"); } - - sort(arr.begin(), arr.end()); } -} - -bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) -{ - return cudaArch.builtWith(feature_set); -} - -bool cv::gpu::TargetArchs::has(int major, int minor) -{ - return hasPtx(major, minor) || hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) -{ - return cudaArch.hasPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasBin(int major, int minor) -{ - return cudaArch.hasBin(major, minor); 
-} - -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) -{ - return cudaArch.hasEqualOrLessPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) -{ - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::deviceSupports(FeatureSet feature_set) -{ - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; else { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; + LOGE("Could not get library name and base address"); } - return TargetArchs::builtWith(feature_set) && (version >= feature_set); + return string(); } -namespace +# else +static const std::string getCudaSupportLibName() { - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; + return "libopencv_core_cuda.so"; +} +# endif - DeviceProps::DeviceProps() - { - props_.resize(10, 0); - } +static bool loadCudaSupportLib() +{ + void* handle; + const std::string name = getCudaSupportLibName(); + handle = dlopen(name.c_str(), RTLD_LAZY); + if (!handle) + return false; - DeviceProps::~DeviceProps() + deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory"); + if (!deviceInfoFactory) { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); + dlclose(handle); + return false; } - cudaDeviceProp* DeviceProps::get(int devID) + gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory"); + if (!gpuFactory) { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; + dlclose(handle); + return false; } - DeviceProps deviceProps; + return true; } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const -{ - return deviceProps.get(device_id_)->sharedMemPerBlock; -} +# else +# error "Dynamic CUDA support is not implemented for this platform!" +# endif -void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const -{ - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); +#endif - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); +static GpuFuncTable* gpuFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyFuncTable stub; + static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub; + static GpuFuncTable *funcTable = libFuncTable ? 
libFuncTable : (GpuFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaFuncTable impl; + static GpuFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static GpuFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + +static DeviceInfoFuncTable* deviceInfoFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub; + static DeviceInfoFuncTable* funcTable = libFuncTable ? libFuncTable : (DeviceInfoFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaDeviceInfoFuncTable impl; + static DeviceInfoFuncTable* funcTable = &impl; +#else + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; } -size_t cv::gpu::DeviceInfo::freeMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; -} -size_t cv::gpu::DeviceInfo::totalMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; -} +//////////////////////////////// Initialization & Info //////////////////////// -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const -{ - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; -} +int cv::gpu::getCudaEnabledDeviceCount() { return deviceInfoFuncTable()->getCudaEnabledDeviceCount(); } -bool cv::gpu::DeviceInfo::isCompatible() const -{ - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; +void cv::gpu::setDevice(int device) { deviceInfoFuncTable()->setDevice(device); } +int cv::gpu::getDevice() { return deviceInfoFuncTable()->getDevice(); } - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; +void cv::gpu::resetDevice() { deviceInfoFuncTable()->resetDevice(); } - return false; -} +bool cv::gpu::deviceSupports(FeatureSet feature_set) { return deviceInfoFuncTable()->deviceSupports(feature_set); } -void cv::gpu::DeviceInfo::query() -{ - const cudaDeviceProp* prop = deviceProps.get(device_id_); +bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return deviceInfoFuncTable()->builtWith(feature_set); } +bool cv::gpu::TargetArchs::has(int major, int minor) { return deviceInfoFuncTable()->has(major, minor); } +bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return deviceInfoFuncTable()->hasPtx(major, minor); } +bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return deviceInfoFuncTable()->hasBin(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrLessPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterBin(major, minor); } - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; -} +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } +void 
cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } -namespace +void cv::gpu::DeviceInfo::query() { - int convertSMVer2Cores(int major, int minor) - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } + deviceInfoFuncTable()->query(); + name_ = deviceInfoFuncTable()->name(); + multi_processor_count_ = deviceInfoFuncTable()->multiProcessorCount(); + majorVersion_ = deviceInfoFuncTable()->majorVersion(); + minorVersion_ = deviceInfoFuncTable()->minorVersion(); } -void cv::gpu::printCudaDeviceInfo(int device) -{ - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); +void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); } - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture 
Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); -} - -void cv::gpu::printShortCudaDeviceInfo(int device) +namespace cv { namespace gpu { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? 
" (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); -} - -#endif // HAVE_CUDA + CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, cudaStream_t = 0); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); +}} //////////////////////////////// GpuMat /////////////////////////////// @@ -830,601 +569,6 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) return mat = GpuMat(rows, cols, type); } -namespace -{ - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; - - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0; - - virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0; - - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; -} - -#ifndef HAVE_CUDA - -namespace -{ - class EmptyFuncTable : public GpuFuncTable - { - public: - void copy(const Mat&, GpuMat&) const { throw_nogpu; } - void copy(const GpuMat&, Mat&) const { throw_nogpu; } - void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } - - void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } - - void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; } - - void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; } - - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } - void free(void*) const {} - }; - - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable empty; - return ∅ - } -} - -#else // HAVE_CUDA - -namespace cv { namespace gpu { namespace device -{ - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - - void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); -}}} - -namespace -{ - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); - } - - template 
void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); - } -} - - -namespace cv { namespace gpu -{ - CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); -}} - - -namespace cv { namespace gpu -{ - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } - - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } -}} - -namespace -{ - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus 
(*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), 
static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; - } - - ////////////////////////////////////////////////////////////////////////// - // CudaFuncTable - - class CudaFuncTable : public GpuFuncTable - { - public: - void copy(const Mat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); - } - - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - if (src.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask } - }; - - const func_t func = mask.channels() == src.channels() ? 
funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask; - - func(src, dst, mask, 0); - } - - void convert(const GpuMat& src, GpuMat& dst) const - { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16U */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 8S */ {cv::gpu::convertTo , 
cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; - - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) - { - cv::gpu::convertTo(src, dst); - return; - } - - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - cv::gpu::convertTo(src, dst, alpha, beta); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); - return; - } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo }, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) 
|| !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s); - } - else - { - typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); - static const func_t funcs[7][4] = - { - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo }, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s, mask); - } - } - - void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const - { - cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); - } - - void free(void* devPtr) const - { - cudaFree(devPtr); - } - }; - - const GpuFuncTable* gpuFuncTable() - { - static CudaFuncTable funcTable; - return &funcTable; - } -} - -#endif // HAVE_CUDA - void cv::gpu::GpuMat::upload(const Mat& m) { CV_DbgAssert(!m.empty()); @@ -1492,9 +636,9 @@ void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double bet dst.create(size(), rtype); if (noScale) - gpuFuncTable()->convert(*psrc, dst); + cv::gpu::convertTo(*psrc, dst); else - gpuFuncTable()->convert(*psrc, dst, alpha, beta); + cv::gpu::convertTo(*psrc, dst, alpha, beta); } GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) @@ -1502,7 +646,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) CV_Assert(mask.empty() || mask.type() == CV_8UC1); CV_DbgAssert(!empty()); - gpuFuncTable()->setTo(*this, s, mask); + gpu::setTo(*this, s, mask); return *this; } @@ -1562,6 +706,39 @@ void cv::gpu::GpuMat::release() refcount = 0; } +namespace cv { namespace gpu +{ + void convertTo(const GpuMat& src, GpuMat& dst) + { + gpuFuncTable()->convert(src, dst); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) + { + gpuFuncTable()->convert(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } +}} + //////////////////////////////////////////////////////////////////////// // Error handling diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt new file mode 100644 index 0000000000..f67879ef91 --- /dev/null +++ b/modules/dynamicuda/CMakeLists.txt @@ -0,0 +1,15 @@ +if(NOT DYNAMIC_CUDA_SUPPORT) + ocv_module_disable(dynamicuda) +endif() + +set(the_description "Dynamic CUDA linkage") + +add_definitions(-DUSE_CUDA) +ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") +set(OPENCV_MODULE_TYPE SHARED) +if 
(BUILD_FAT_JAVA_LIB)
+  ocv_define_module(dynamicuda opencv_java PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+else()
+  ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
new file mode 100644
index 0000000000..8973c53049
--- /dev/null
+++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
@@ -0,0 +1,1112 @@
+#ifndef __GPUMAT_CUDA_HPP__
+#define __GPUMAT_CUDA_HPP__
+
+#ifndef HAVE_CUDA
+typedef void* cudaStream_t;
+#endif
+
+class DeviceInfoFuncTable
+{
+public:
+    // cv::DeviceInfo
+    virtual size_t sharedMemPerBlock() const = 0;
+    virtual void queryMemory(size_t&, size_t&) const = 0;
+    virtual size_t freeMemory() const = 0;
+    virtual size_t totalMemory() const = 0;
+    virtual bool supports(FeatureSet) const = 0;
+    virtual bool isCompatible() const = 0;
+    virtual void query() = 0;
+    virtual int deviceID() const = 0;
+    virtual std::string name() const = 0;
+    virtual int majorVersion() const = 0;
+    virtual int minorVersion() const = 0;
+    virtual int multiProcessorCount() const = 0;
+    virtual int getCudaEnabledDeviceCount() const = 0;
+    virtual void setDevice(int) const = 0;
+    virtual int getDevice() const = 0;
+    virtual void resetDevice() const = 0;
+    virtual bool deviceSupports(FeatureSet) const = 0;
+
+    // cv::TargetArchs
+    virtual bool builtWith(FeatureSet) const = 0;
+    virtual bool has(int, int) const = 0;
+    virtual bool hasPtx(int, int) const = 0;
+    virtual bool hasBin(int, int) const = 0;
+    virtual bool hasEqualOrLessPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreater(int, int) const = 0;
+    virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreaterBin(int, int) const = 0;
+
+    virtual void printCudaDeviceInfo(int) const = 0;
+    virtual void printShortCudaDeviceInfo(int) const = 0;
+
+    virtual ~DeviceInfoFuncTable() {};
+};
+
+class GpuFuncTable
+{
+public:
+    virtual ~GpuFuncTable() {}
+
+    // GpuMat routines
+    virtual void copy(const Mat& src, GpuMat& dst) const = 0;
+    virtual void copy(const GpuMat& src, Mat& dst) const = 0;
+    virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
+
+    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
+
+    // gpu::device::convertTo funcs
+    virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
+    virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
+
+    // for gpu::device::setTo funcs
+    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const = 0;
+
+    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
+    virtual void free(void* devPtr) const = 0;
+};
+
+class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
+{
+public:
+    size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
+    void queryMemory(size_t&, size_t&) const { throw_nogpu; }
+    size_t freeMemory() const { throw_nogpu; return 0; }
+    size_t totalMemory() const { throw_nogpu; return 0; }
+    bool supports(FeatureSet) const { throw_nogpu; return false; }
+    bool isCompatible() const { throw_nogpu; return false; }
+    void query() { throw_nogpu; }
+    int deviceID() const { throw_nogpu; return -1; };
+    std::string name() const { throw_nogpu; return std::string(); }
+    int majorVersion() const { throw_nogpu; return -1; }
+    int minorVersion() const { throw_nogpu; return -1; }
} + int multiProcessorCount() const { throw_nogpu; return -1; } + + int getCudaEnabledDeviceCount() const { return 0; } + + void setDevice(int) const { throw_nogpu; } + int getDevice() const { throw_nogpu; return 0; } + + void resetDevice() const { throw_nogpu; } + + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } + + bool builtWith(FeatureSet) const { throw_nogpu; return false; } + bool has(int, int) const { throw_nogpu; return false; } + bool hasPtx(int, int) const { throw_nogpu; return false; } + bool hasBin(int, int) const { throw_nogpu; return false; } + bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } + + void printCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } + + void printShortCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } +}; + +class EmptyFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat&, GpuMat&) const { throw_nogpu; } + void copy(const GpuMat&, Mat&) const { throw_nogpu; } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } + + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } + + void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } + void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } + + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const { throw_nogpu; } + + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } + void free(void*) const {} +}; + +#if defined(USE_CUDA) + +#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) +#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + +inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); +} + +inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") +{ + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } +} + +namespace cv { namespace gpu { namespace device +{ + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + + void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); +}}} + +template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); +} + +template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); +} + +template struct NPPTypeTraits; +template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; +template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; +template<> struct NPPTypeTraits { 
+#if defined(USE_CUDA)
+
+#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
+#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
+
+inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+{
+    if (cudaSuccess != err)
+        cv::gpu::error(cudaGetErrorString(err), file, line, func);
+}
+
+inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+{
+    if (err < 0)
+    {
+        std::ostringstream msg;
+        msg << "NPP API Call Error: " << err;
+        cv::gpu::error(msg.str().c_str(), file, line, func);
+    }
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
+
+    template <typename T>
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);
+
+    template <typename T>
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+
+    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
+}}}
+
+template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+{
+    Scalar_<T> sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
+}
+
+template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+{
+    Scalar_<T> sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+}
+
+template<int n> struct NPPTypeTraits;
+template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };
+template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s npp_type; };
+template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
+template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
+template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
+template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
+template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
+
+//////////////////////////////////////////////////////////////////////////
+// Convert
+
+template<int SDEPTH, int DDEPTH> struct NppConvertFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
+};
+template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
+{
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
+};
+
+template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    static void call(const GpuMat& src, GpuMat& dst)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
+{
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    static void call(const GpuMat& src, GpuMat& dst)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Set
+
+template<int SDEPTH, int SCN> struct NppSetFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+template<int SCN> struct NppSetFunc<CV_8S, SCN>
+{
+    typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+template<> struct NppSetFunc<CV_8S, 1>
+{
+    typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+
+template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+template<int SDEPTH, int SCN> struct NppSetMaskFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+
+template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// CopyMasked
+
+template<int SDEPTH> struct NppCopyMaskedFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+
+template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+template <typename T> static inline bool isAligned(const T* ptr, size_t size)
+{
+    return reinterpret_cast<size_t>(ptr) % size == 0;
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0);
+    void convertTo(const GpuMat& src, GpuMat& dst);
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0);
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
+    void setTo(GpuMat& src, Scalar s);
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask);
+
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
+        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
+
+        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
+    }
+
+    void convertTo(const GpuMat& src, GpuMat& dst)
+    {
+        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
+    }
+
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
+    {
+        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
+
+        static const caller_t callers[] =
+        {
+            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
+            kernelSetCaller<float>, kernelSetCaller<double>
+        };
+
+        callers[src.depth()](src, s, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
+
+        static const caller_t callers[] =
+        {
+            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
+            kernelSetCaller<float>, kernelSetCaller<double>
+        };
+
+        callers[src.depth()](src, s, mask, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s)
+    {
+        setTo(src, s, 0);
+    }
+
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        setTo(src, s, mask, 0);
+    }
+}}}
+
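/* NOTE: the adapter templates above (NppCvt, NppSet, NppSetMask, NppCopyMasked)
   normalize NPP's per-type C entry points behind a uniform static call(), and
   kernelSetCaller<T> does the same for the custom kernels, so CudaFuncTable
   further down can dispatch through plain function-pointer tables indexed by
   matrix depth and channel count. A usage sketch, not part of the patch:

       GpuMat m(480, 640, CV_32FC1);
       cv::gpu::device::setTo(m, Scalar::all(1.0));   // -> kernelSetCaller<float>
*/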
+class CudaArch
+{
+public:
+    CudaArch()
+    {
+        fromStr(CUDA_ARCH_BIN, bin);
+        fromStr(CUDA_ARCH_PTX, ptx);
+        fromStr(CUDA_ARCH_FEATURES, features);
+    }
+
+    bool builtWith(FeatureSet feature_set) const
+    {
+        return !features.empty() && (features.back() >= feature_set);
+    }
+
+    bool hasPtx(int major, int minor) const
+    {
+        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
+    }
+
+    bool hasBin(int major, int minor) const
+    {
+        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
+    }
+
+    bool hasEqualOrLessPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+    }
+
+    bool hasEqualOrGreaterPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+    }
+
+    bool hasEqualOrGreaterBin(int major, int minor) const
+    {
+        return !bin.empty() && (bin.back() >= major * 10 + minor);
+    }
+
+
+private:
+    void fromStr(const string& set_as_str, vector<int>& arr)
+    {
+        if (set_as_str.find_first_not_of(" ") == string::npos)
+            return;
+
+        istringstream stream(set_as_str);
+        int cur_value;
+
+        while (!stream.eof())
+        {
+            stream >> cur_value;
+            arr.push_back(cur_value);
+        }
+
+        sort(arr.begin(), arr.end());
+    }
+
+    vector<int> bin;
+    vector<int> ptx;
+    vector<int> features;
+};
+
+class DeviceProps
+{
+public:
+    DeviceProps()
+    {
+        props_.resize(10, 0);
+    }
+
+    ~DeviceProps()
+    {
+        for (size_t i = 0; i < props_.size(); ++i)
+        {
+            if (props_[i])
+                delete props_[i];
+        }
+        props_.clear();
+    }
+
+    cudaDeviceProp* get(int devID)
+    {
+        if (devID >= (int) props_.size())
+            props_.resize(devID + 5, 0);
+
+        if (!props_[devID])
+        {
+            props_[devID] = new cudaDeviceProp;
+            cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
+        }
+
+        return props_[devID];
+    }
+private:
+    std::vector<cudaDeviceProp*> props_;
+};
+
+DeviceProps deviceProps;
+
+class CudaDeviceInfoFuncTable : public DeviceInfoFuncTable
+{
+public:
+    size_t sharedMemPerBlock() const
+    {
+        return deviceProps.get(device_id_)->sharedMemPerBlock;
+    }
+
+    void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
+    {
+        int prevDeviceID = getDevice();
+        if (prevDeviceID != device_id_)
+            setDevice(device_id_);
+
+        cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
+
+        if (prevDeviceID != device_id_)
+            setDevice(prevDeviceID);
+    }
+
+    size_t freeMemory() const
+    {
+        size_t _totalMemory, _freeMemory;
+        queryMemory(_totalMemory, _freeMemory);
+        return _freeMemory;
+    }
+
+    size_t totalMemory() const
+    {
+        size_t _totalMemory, _freeMemory;
+        queryMemory(_totalMemory, _freeMemory);
+        return _totalMemory;
+    }
+
+    bool supports(FeatureSet feature_set) const
+    {
+        int version = majorVersion_ * 10 + minorVersion_;
+        return version >= feature_set;
+    }
+
+    bool isCompatible() const
+    {
+        // Check PTX compatibility
+        if (hasEqualOrLessPtx(majorVersion_, minorVersion_))
+            return true;
+
+        // Check BIN compatibility
+        for (int i = minorVersion_; i >= 0; --i)
+            if (hasBin(majorVersion_, i))
+                return true;
+
+        return false;
+    }
+
+    void query()
+    {
+        const cudaDeviceProp* prop = deviceProps.get(device_id_);
+
+        name_ = prop->name;
+        multi_processor_count_ =
prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + int deviceID() const + { + return device_id_; + } + + std::string name() const + { + return name_; + } + + int majorVersion() const + { + return majorVersion_; + } + + int minorVersion() const + { + return minorVersion_; + } + + int multiProcessorCount() const + { + return multi_processor_count_; + } + + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + + if (error == cudaErrorInsufficientDriver) + return -1; + + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const + { + cudaSafeCall( cudaSetDevice( device ) ); + } + + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool hasEqualOrLessPtx(int major, int minor) const + { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } + + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + +private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } +}; + +class CudaFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || 
!DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } + }; + + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; + + func(src, dst, mask, 0); + } + + void convert(const GpuMat& src, GpuMat& dst) const + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst); + static const func_t funcs[7][7][4] = + { + { + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32F */ 
{NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 64F */ {0,0,0,0} + } + }; + + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); 
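/* NOTE: convert() is table-driven: funcs[src.depth()][dst.depth()][channels - 1]
   mixes NPP fast paths (NppCvt<...>::call) with the generic templated kernel
   cv::gpu::device::convertTo, and null entries mark same-depth pairs that never
   reach the lookup. The dispatch itself reduces to:

       const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
       CV_DbgAssert(func != 0);
       func(src, dst);

   NPP entry points require 16-byte-aligned data, hence the isAligned() guard
   below that falls back to the generic kernel. */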
+ CV_Assert(dst.depth() <= CV_64F); + CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) + { + cv::gpu::device::convertTo(src, dst); + return; + } + + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert(func != 0); + + func(src, dst); + } + + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const + { + if (mask.empty()) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + { + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + return; + } + + if (m.depth() == CV_8U) + { + int cn = m.channels(); + + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) + { + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); + return; + } + } + + typedef void (*func_t)(GpuMat& src, Scalar s); + static const func_t funcs[7][4] = + { + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + if (stream) + cv::gpu::device::setTo(m, s, stream); + else + funcs[m.depth()][m.channels() - 1](m, s); + } + else + { + typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); + static const func_t funcs[7][4] = + { + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo }, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo } + }; + + 
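/* NOTE: mirroring the unmasked branch above, a non-null stream is routed to the
   asynchronous templated kernel (the masked cv::gpu::device::setTo), while the
   null-stream path may dispatch into the NPP table. Every NPP wrapper in this
   header ends with cudaDeviceSynchronize(), so the NPP paths are deliberately
   blocking. */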
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+
+            if (stream)
+                cv::gpu::device::setTo(m, s, mask, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s, mask);
+        }
+    }
+
+    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
+    {
+        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
+    }
+
+    void free(void* devPtr) const
+    {
+        cudaFree(devPtr);
+    }
+};
+#endif
+#endif
diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/dynamicuda/src/cuda/matrix_operations.cu
similarity index 100%
rename from modules/core/src/cuda/matrix_operations.cu
rename to modules/dynamicuda/src/cuda/matrix_operations.cu
diff --git a/modules/dynamicuda/src/main.cpp b/modules/dynamicuda/src/main.cpp
new file mode 100644
index 0000000000..8eb66fd98d
--- /dev/null
+++ b/modules/dynamicuda/src/main.cpp
@@ -0,0 +1,55 @@
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+
+#include
+#include
+
+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#include <npp.h>
+
+#define CUDART_MINIMUM_REQUIRED_VERSION 4020
+#define NPP_MINIMUM_REQUIRED_VERSION 4200
+
+#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient Cuda Runtime library version, please update it."
+#endif
+
+#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient NPP version, please update it."
+#endif
+#endif
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
+
+#include "opencv2/dynamicuda/dynamicuda.hpp"
+
+#ifdef HAVE_CUDA
+static CudaDeviceInfoFuncTable deviceInfoTable;
+static CudaFuncTable gpuTable;
+#else
+static EmptyDeviceInfoFuncTable deviceInfoTable;
+static EmptyFuncTable gpuTable;
+#endif
+
+extern "C" {
+
+DeviceInfoFuncTable* deviceInfoFactory();
+GpuFuncTable* gpuFactory();
+
+DeviceInfoFuncTable* deviceInfoFactory()
+{
+    return (DeviceInfoFuncTable*)&deviceInfoTable;
+}
+
+GpuFuncTable* gpuFactory()
+{
+    return (GpuFuncTable*)&gpuTable;
+}
+
+}
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index a616597894..9171febc74 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -3,7 +3,8 @@ if(IOS)
 endif()
 
 set(the_description "GPU-accelerated Computer Vision")
-ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)
+ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy
+    OPTIONAL ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
 
 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
diff --git a/modules/gpu/perf4au/CMakeLists.txt b/modules/gpu/perf4au/CMakeLists.txt
index 376e7b2706..13efe7ffa3 100644
--- a/modules/gpu/perf4au/CMakeLists.txt
+++ b/modules/gpu/perf4au/CMakeLists.txt
@@ -2,26 +2,28 @@ set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video
 ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS})
 
-set(the_target gpu_perf4au)
-project(${the_target})
+if (OCV_DEPENDENCIES_FOUND)
+  set(the_target gpu_perf4au)
+  project(${the_target})
 
-ocv_include_modules(${PERF4AU_REQUIRED_DEPS})
+  ocv_include_modules(${PERF4AU_REQUIRED_DEPS})
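# NOTE: the net effect of this patch is that opencv_dynamicuda carries the
# CUDA/NPP link dependencies, while opencv_core stays free of them whenever
# ENABLE_DYNAMIC_CUDA is ON, as in the Android configuration targeted here.
# A typical configure line, as a sketch -- the toolchain path and options
# depend on the local checkout:
#
#   cmake -DCMAKE_TOOLCHAIN_FILE=platforms/android/android.toolchain.cmake \
#         -DWITH_CUDA=ON -DENABLE_DYNAMIC_CUDA=ON <opencv_source_dir>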
-if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) + if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") -endif() + endif() -file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp) -add_executable(${the_target} ${srcs}) + file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp) + add_executable(${the_target} ${srcs}) -target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS}) + target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS}) -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "tests performance") -endif() + if(ENABLE_SOLUTION_FOLDERS) + set_target_properties(${the_target} PROPERTIES FOLDER "tests performance") + endif() -if(WIN32) + if(WIN32) if(MSVC AND NOT BUILD_SHARED_LIBS) - set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG") + set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG") endif() -endif() + endif() +endif() \ No newline at end of file diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt index 5012f914c7..3a6ebe8362 100644 --- a/modules/java/CMakeLists.txt +++ b/modules/java/CMakeLists.txt @@ -297,6 +297,12 @@ if(BUILD_FAT_JAVA_LIB) list(REMOVE_ITEM __deps ${m}) endif() endforeach() + if (ENABLE_DYNAMIC_CUDA) + list(REMOVE_ITEM __deps "opencv_dynamicuda") + endif() + if (ANDROID AND HAVE_opencv_gpu) + list(REMOVE_ITEM __deps "opencv_gpu") + endif() ocv_list_unique(__deps) set(__extradeps ${__deps}) ocv_list_filterout(__extradeps "^opencv_") diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt index fda44591f7..6e9a35ba73 100644 --- a/modules/stitching/CMakeLists.txt +++ b/modules/stitching/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Images stitching") -ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +if (ENABLE_DYNAMIC_CUDA) + ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_nonfree) +else() + ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +endif() \ No newline at end of file diff --git a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp index 09a1a106fd..9301dc5ebe 100644 --- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp @@ -227,7 +227,7 @@ private: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS GraphCutSeamFinderGpu : public GraphCutSeamFinderBase, public PairwiseSeamFinder { public: diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp index 2bd46f75a9..d44bfe69eb 100644 --- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp @@ -46,7 +46,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/opencv_modules.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) # include "opencv2/gpu/gpu.hpp" #endif @@ -331,7 +331,7 @@ 
public: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS PlaneWarperGpu : public PlaneWarper { public: diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp index 7475d1304a..87efa7e80a 100644 --- a/modules/stitching/include/opencv2/stitching/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/warpers.hpp @@ -145,7 +145,7 @@ public: -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class PlaneWarperGpu: public WarperCreator { public: diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp index e65023a55d..fb3c0d666b 100644 --- a/modules/stitching/src/blenders.cpp +++ b/modules/stitching/src/blenders.cpp @@ -189,7 +189,7 @@ Rect FeatherBlender::createWeightMaps(const vector &masks, const vector &pyr) void createLaplacePyrGpu(const Mat &img, int num_levels, vector &pyr) { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) pyr.resize(num_levels + 1); vector gpu_pyr(num_levels + 1); @@ -531,7 +531,7 @@ void restoreImageFromLaplacePyr(vector &pyr) void restoreImageFromLaplacePyrGpu(vector &pyr) { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (pyr.empty()) return; diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp index d918cfff29..d86206233f 100644 --- a/modules/stitching/src/matchers.cpp +++ b/modules/stitching/src/matchers.cpp @@ -46,7 +46,7 @@ using namespace std; using namespace cv; using namespace cv::detail; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) using namespace cv::gpu; #endif @@ -129,7 +129,7 @@ private: float match_conf_; }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class GpuMatcher : public FeaturesMatcher { public: @@ -204,7 +204,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat LOG("1->2 & 2->1 matches: " << matches_info.matches.size() << endl); } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo& matches_info) { matches_info.matches.clear(); @@ -432,7 +432,7 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features) } } -#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) +#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID) SurfFeaturesFinderGpu::SurfFeaturesFinderGpu(double hess_thresh, int num_octaves, int num_layers, int num_octaves_descr, int num_layers_descr) { @@ -533,7 +533,7 @@ void FeaturesMatcher::operator ()(const vector &features, vector< BestOf2NearestMatcher::BestOf2NearestMatcher(bool try_use_gpu, float match_conf, int num_matches_thresh1, int num_matches_thresh2) { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_use_gpu && getCudaEnabledDeviceCount() > 0) impl_ = new GpuMatcher(match_conf); else diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp index 1050856d31..54b6721437 100644 --- a/modules/stitching/src/precomp.hpp +++ b/modules/stitching/src/precomp.hpp @@ -68,7 +68,7 @@ #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/features2d/features2d.hpp" #include "opencv2/calib3d/calib3d.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) #include "opencv2/gpu/gpu.hpp" #ifdef HAVE_OPENCV_NONFREE diff --git 
a/modules/stitching/src/seam_finders.cpp b/modules/stitching/src/seam_finders.cpp index 784209c935..a198c1ebb4 100644 --- a/modules/stitching/src/seam_finders.cpp +++ b/modules/stitching/src/seam_finders.cpp @@ -1318,7 +1318,7 @@ void GraphCutSeamFinder::find(const vector &src, const vector &corne } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) void GraphCutSeamFinderGpu::find(const vector &src, const vector &corners, vector &masks) { diff --git a/modules/stitching/src/stitcher.cpp b/modules/stitching/src/stitcher.cpp index 5da26f6dbf..4a36ab0a45 100644 --- a/modules/stitching/src/stitcher.cpp +++ b/modules/stitching/src/stitcher.cpp @@ -58,7 +58,7 @@ Stitcher Stitcher::createDefault(bool try_use_gpu) stitcher.setFeaturesMatcher(new detail::BestOf2NearestMatcher(try_use_gpu)); stitcher.setBundleAdjuster(new detail::BundleAdjusterRay()); -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_use_gpu && gpu::getCudaEnabledDeviceCount() > 0) { #if defined(HAVE_OPENCV_NONFREE) diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp index 932958c6f7..935831950f 100644 --- a/modules/stitching/src/warpers.cpp +++ b/modules/stitching/src/warpers.cpp @@ -212,7 +212,7 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, gpu::GpuMat &xmap, gpu::GpuMat &ymap) { return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap); diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt index 44e9dc0f3b..3da8dc2c6e 100644 --- a/modules/superres/CMakeLists.txt +++ b/modules/superres/CMakeLists.txt @@ -4,4 +4,4 @@ endif() set(the_description "Super Resolution") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef) -ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl) +ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) diff --git a/modules/videostab/CMakeLists.txt b/modules/videostab/CMakeLists.txt index ac5cb0d69b..84ec1d2e8d 100644 --- a/modules/videostab/CMakeLists.txt +++ b/modules/videostab/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Video stabilization") -ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +if(ENABLE_DYNAMIC_CUDA) + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui) +else() + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +endif() diff --git a/modules/videostab/include/opencv2/videostab/optical_flow.hpp b/modules/videostab/include/opencv2/videostab/optical_flow.hpp index 18b7d3f283..2c1742fc79 100644 --- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp +++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp @@ -46,7 +46,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/opencv_modules.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) # include "opencv2/gpu/gpu.hpp" #endif @@ -98,7 +98,7 @@ public: OutputArray status, OutputArray errors); }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS 
DensePyrLkOptFlowEstimatorGpu : public PyrLkOptFlowEstimatorBase, public IDenseOptFlowEstimator { diff --git a/modules/videostab/src/inpainting.cpp b/modules/videostab/src/inpainting.cpp index 4377c007c8..c6568e071e 100644 --- a/modules/videostab/src/inpainting.cpp +++ b/modules/videostab/src/inpainting.cpp @@ -323,7 +323,7 @@ public: MotionInpainter::MotionInpainter() { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) setOptFlowEstimator(new DensePyrLkOptFlowEstimatorGpu()); #else CV_Error(CV_StsNotImplemented, "Current implementation of MotionInpainter requires GPU"); diff --git a/modules/videostab/src/optical_flow.cpp b/modules/videostab/src/optical_flow.cpp index 46100fdb59..3441df1683 100644 --- a/modules/videostab/src/optical_flow.cpp +++ b/modules/videostab/src/optical_flow.cpp @@ -59,7 +59,7 @@ void SparsePyrLkOptFlowEstimator::run( } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu() { CV_Assert(gpu::getCudaEnabledDeviceCount() > 0); diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp index 49d86086de..7394a72821 100644 --- a/samples/cpp/stitching_detailed.cpp +++ b/samples/cpp/stitching_detailed.cpp @@ -355,7 +355,7 @@ int main(int argc, char* argv[]) Ptr finder; if (features_type == "surf") { -#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) +#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) finder = new SurfFeaturesFinderGpu(); else @@ -543,7 +543,7 @@ int main(int argc, char* argv[]) // Warp images and their masks Ptr warper_creator; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) { if (warp_type == "plane") warper_creator = new cv::PlaneWarperGpu(); @@ -608,7 +608,7 @@ int main(int argc, char* argv[]) seam_finder = new detail::VoronoiSeamFinder(); else if (seam_find_type == "gc_color") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR); else @@ -617,7 +617,7 @@ int main(int argc, char* argv[]) } else if (seam_find_type == "gc_colorgrad") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR_GRAD); else