diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f793f1070..2973f4ca00 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi
 # OpenCV build options
 # ===================================================
+OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enable dynamic CUDA linkage" ON IF ANDROID )
 OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) )
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") )
 OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX )
@@ -472,7 +473,11 @@ endif()
 # ----------------------------------------------------------------------------
 # Add CUDA libraries (needed for apps/tools, samples)
 # ----------------------------------------------------------------------------
-if(HAVE_CUDA)
+if(NOT HAVE_CUDA)
+  set(ENABLE_DYNAMIC_CUDA OFF)
+endif()
+
+if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
   set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
   if(HAVE_CUBLAS)
     set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY})
@@ -481,6 +486,7 @@ if(HAVE_CUDA)
     set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY})
   endif()
 endif()
+
 # ----------------------------------------------------------------------------
 # Solution folders:
 # ----------------------------------------------------------------------------
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index c923aba413..3dd749b053 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -488,7 +488,7 @@ macro(ocv_glob_module_sources)
   file(GLOB lib_cuda_srcs "src/cuda/*.cu")
   set(cuda_objs "")
   set(lib_cuda_hdrs "")
-  if(HAVE_CUDA AND lib_cuda_srcs)
+  if(HAVE_CUDA)
     ocv_include_directories(${CUDA_INCLUDE_DIRS})
     file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
@@ -537,9 +537,6 @@ macro(ocv_create_module)
     target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS})
     target_link_libraries(${the_module} LINK_INTERFACE_LIBRARIES ${OPENCV_MODULE_${the_module}_DEPS})
     target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
-    if (HAVE_CUDA)
-      target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
-    endif()
   endif()
   add_dependencies(opencv_modules ${the_module})
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 66b8ae0d2f..a1e71bf4f7 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,11 +1,25 @@
 set(the_description "The Core Functionality")
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
+  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+else()
+  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
+
+ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/dynamicuda/include/" ${ZLIB_INCLUDE_DIR})
 if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
+
+if(ENABLE_DYNAMIC_CUDA)
+  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+else()
+  if (HAVE_CUDA)
+    add_definitions(-DUSE_CUDA)
+  endif()
+endif()
+
 if(HAVE_CUDA)
   ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
@@ -14,11 +28,26 @@ endif()
 file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
+if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
+  file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
+  ocv_include_directories(${CUDA_INCLUDE_DIRS})
+  ocv_cuda_compile(cuda_objs ${lib_cuda})
+endif()
+
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
-ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
-                        HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
+  source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
+endif()
+
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+else()
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} ${cuda_objs}
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+endif()
 ocv_create_module()
 ocv_add_precompiled_headers(${the_module})
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 4c4af61c47..94bb548235 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA)
     #include <cuda_runtime.h>
    #include <npp.h>
@@ -60,493 +60,232 @@
 #endif
 #endif
+#ifdef DYNAMIC_CUDA_SUPPORT
+#include <dlfcn.h>
+#include
+#include
+#include
+#endif
+
+#ifdef ANDROID
+# include <android/log.h>
+
+# define LOG_TAG "OpenCV::CUDA"
+# define LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__))
+# define LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__))
+# define LOGI(...)
((void)__android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)) +#endif + using namespace std; using namespace cv; using namespace cv::gpu; -#ifndef HAVE_CUDA - #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") -#else // HAVE_CUDA - -namespace -{ -#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) -#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) - - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") - { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -} - -#endif // HAVE_CUDA - -//////////////////////////////// Initialization & Info //////////////////////// - -#ifndef HAVE_CUDA +#include "opencv2/dynamicuda/dynamicuda.hpp" -int cv::gpu::getCudaEnabledDeviceCount() { return 0; } +#ifdef DYNAMIC_CUDA_SUPPORT -void cv::gpu::setDevice(int) { throw_nogpu; } -int cv::gpu::getDevice() { throw_nogpu; return 0; } +typedef GpuFuncTable* (*GpuFactoryType)(); +typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)(); -void cv::gpu::resetDevice() { throw_nogpu; } +static GpuFactoryType gpuFactory = NULL; +static DeviceInfoFactoryType deviceInfoFactory = NULL; -bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; } - -bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; } - -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; } -void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; } -size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; } -size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; } -bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; } -bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; } -void cv::gpu::DeviceInfo::query() { throw_nogpu; } - -void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; } -void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; } - -#else // HAVE_CUDA - -int cv::gpu::getCudaEnabledDeviceCount() +# if defined(__linux__) || defined(__APPLE__) || defined (ANDROID) +# ifdef ANDROID +static const std::string getCudaSupportLibName() { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); - - if (error == cudaErrorInsufficientDriver) - return -1; - - if (error == cudaErrorNoDevice) - return 0; - - cudaSafeCall( error ); - return count; -} - -void cv::gpu::setDevice(int device) -{ - cudaSafeCall( cudaSetDevice( device ) ); -} - -int cv::gpu::getDevice() -{ - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; -} - -void cv::gpu::resetDevice() -{ - 
cudaSafeCall( cudaDeviceReset() ); -} - -namespace -{ - class CudaArch + Dl_info dl_info; + if(0 != dladdr((void *)getCudaSupportLibName, &dl_info)) { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; + LOGD("Library name: %s", dl_info.dli_fname); + LOGD("Library base address: %p", dl_info.dli_fbase); - private: - static void fromStr(const string& set_as_str, vector& arr); + const char* libName=dl_info.dli_fname; + while( ((*libName)=='/') || ((*libName)=='.') ) + libName++; - vector bin; - vector ptx; - vector features; - }; + char lineBuf[2048]; + FILE* file = fopen("/proc/self/smaps", "rt"); - const CudaArch cudaArch; - - CudaArch::CudaArch() - { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } - - bool CudaArch::builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } - - bool CudaArch::hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool CudaArch::hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } + if(file) + { + while (fgets(lineBuf, sizeof lineBuf, file) != NULL) + { + //verify that line ends with library name + int lineLength = strlen(lineBuf); + int libNameLength = strlen(libName); - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } + //trim end + for(int i = lineLength - 1; i >= 0 && isspace(lineBuf[i]); --i) + { + lineBuf[i] = 0; + --lineLength; + } - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } + if (0 != strncmp(lineBuf + lineLength - libNameLength, libName, libNameLength)) + { + //the line does not contain the library name + continue; + } - bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } + //extract path from smaps line + char* pathBegin = strchr(lineBuf, '/'); + if (0 == pathBegin) + { + LOGE("Strange error: could not find path beginning in lin \"%s\"", lineBuf); + continue; + } - void CudaArch::fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; + char* pathEnd = strrchr(pathBegin, '/'); + pathEnd[1] = 0; - istringstream stream(set_as_str); - int cur_value; + LOGD("Libraries folder found: %s", pathBegin); - while (!stream.eof()) + fclose(file); + return std::string(pathBegin) + "/libopencv_core_cuda.so"; + } + fclose(file); + LOGE("Could not find library path"); + } + else { - stream >> cur_value; - arr.push_back(cur_value); + LOGE("Could not read /proc/self/smaps"); } - - sort(arr.begin(), arr.end()); } -} - -bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) -{ - return cudaArch.builtWith(feature_set); -} - -bool cv::gpu::TargetArchs::has(int major, int minor) -{ - return hasPtx(major, minor) || hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) -{ - return cudaArch.hasPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasBin(int major, int minor) -{ - return cudaArch.hasBin(major, minor); 
-} - -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) -{ - return cudaArch.hasEqualOrLessPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) -{ - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::deviceSupports(FeatureSet feature_set) -{ - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; else { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; + LOGE("Could not get library name and base address"); } - return TargetArchs::builtWith(feature_set) && (version >= feature_set); + return string(); } -namespace +# else +static const std::string getCudaSupportLibName() { - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; + return "libopencv_core_cuda.so"; +} +# endif - DeviceProps::DeviceProps() - { - props_.resize(10, 0); - } +static bool loadCudaSupportLib() +{ + void* handle; + const std::string name = getCudaSupportLibName(); + handle = dlopen(name.c_str(), RTLD_LAZY); + if (!handle) + return false; - DeviceProps::~DeviceProps() + deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory"); + if (!deviceInfoFactory) { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); + dlclose(handle); + return false; } - cudaDeviceProp* DeviceProps::get(int devID) + gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory"); + if (!gpuFactory) { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; + dlclose(handle); + return false; } - DeviceProps deviceProps; + return true; } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const -{ - return deviceProps.get(device_id_)->sharedMemPerBlock; -} +# else +# error "Dynamic CUDA support is not implemented for this platform!" +# endif -void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const -{ - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); +#endif - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); +static GpuFuncTable* gpuFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyFuncTable stub; + static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub; + static GpuFuncTable *funcTable = libFuncTable ? 
libFuncTable : (GpuFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaFuncTable impl; + static GpuFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static GpuFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + +static DeviceInfoFuncTable* deviceInfoFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub; + static DeviceInfoFuncTable* funcTable = libFuncTable ? libFuncTable : (DeviceInfoFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaDeviceInfoFuncTable impl; + static DeviceInfoFuncTable* funcTable = &impl; +#else + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; } -size_t cv::gpu::DeviceInfo::freeMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; -} -size_t cv::gpu::DeviceInfo::totalMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; -} +//////////////////////////////// Initialization & Info //////////////////////// -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const -{ - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; -} +int cv::gpu::getCudaEnabledDeviceCount() { return deviceInfoFuncTable()->getCudaEnabledDeviceCount(); } -bool cv::gpu::DeviceInfo::isCompatible() const -{ - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; +void cv::gpu::setDevice(int device) { deviceInfoFuncTable()->setDevice(device); } +int cv::gpu::getDevice() { return deviceInfoFuncTable()->getDevice(); } - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; +void cv::gpu::resetDevice() { deviceInfoFuncTable()->resetDevice(); } - return false; -} +bool cv::gpu::deviceSupports(FeatureSet feature_set) { return deviceInfoFuncTable()->deviceSupports(feature_set); } -void cv::gpu::DeviceInfo::query() -{ - const cudaDeviceProp* prop = deviceProps.get(device_id_); +bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return deviceInfoFuncTable()->builtWith(feature_set); } +bool cv::gpu::TargetArchs::has(int major, int minor) { return deviceInfoFuncTable()->has(major, minor); } +bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return deviceInfoFuncTable()->hasPtx(major, minor); } +bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return deviceInfoFuncTable()->hasBin(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrLessPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterBin(major, minor); } - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; -} +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } +void 
cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } -namespace +void cv::gpu::DeviceInfo::query() { - int convertSMVer2Cores(int major, int minor) - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } + deviceInfoFuncTable()->query(); + name_ = deviceInfoFuncTable()->name(); + multi_processor_count_ = deviceInfoFuncTable()->multiProcessorCount(); + majorVersion_ = deviceInfoFuncTable()->majorVersion(); + minorVersion_ = deviceInfoFuncTable()->minorVersion(); } -void cv::gpu::printCudaDeviceInfo(int device) -{ - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); +void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); } - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture 
Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); -} - -void cv::gpu::printShortCudaDeviceInfo(int device) +namespace cv { namespace gpu { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? 
" (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); -} - -#endif // HAVE_CUDA + CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, cudaStream_t = 0); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); +}} //////////////////////////////// GpuMat /////////////////////////////// @@ -830,601 +569,6 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) return mat = GpuMat(rows, cols, type); } -namespace -{ - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; - - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0; - - virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0; - - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; -} - -#ifndef HAVE_CUDA - -namespace -{ - class EmptyFuncTable : public GpuFuncTable - { - public: - void copy(const Mat&, GpuMat&) const { throw_nogpu; } - void copy(const GpuMat&, Mat&) const { throw_nogpu; } - void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } - - void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } - - void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; } - - void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; } - - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } - void free(void*) const {} - }; - - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable empty; - return ∅ - } -} - -#else // HAVE_CUDA - -namespace cv { namespace gpu { namespace device -{ - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - - void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); -}}} - -namespace -{ - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); - } - - template 
void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); - } -} - - -namespace cv { namespace gpu -{ - CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); -}} - - -namespace cv { namespace gpu -{ - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } - - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } -}} - -namespace -{ - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus 
(*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), 
static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; - } - - ////////////////////////////////////////////////////////////////////////// - // CudaFuncTable - - class CudaFuncTable : public GpuFuncTable - { - public: - void copy(const Mat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); - } - - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - if (src.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask } - }; - - const func_t func = mask.channels() == src.channels() ? 
funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask; - - func(src, dst, mask, 0); - } - - void convert(const GpuMat& src, GpuMat& dst) const - { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16U */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 8S */ {cv::gpu::convertTo , 
cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; - - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) - { - cv::gpu::convertTo(src, dst); - return; - } - - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - cv::gpu::convertTo(src, dst, alpha, beta); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); - return; - } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo }, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) 
|| !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s); - } - else - { - typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); - static const func_t funcs[7][4] = - { - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo }, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s, mask); - } - } - - void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const - { - cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); - } - - void free(void* devPtr) const - { - cudaFree(devPtr); - } - }; - - const GpuFuncTable* gpuFuncTable() - { - static CudaFuncTable funcTable; - return &funcTable; - } -} - -#endif // HAVE_CUDA - void cv::gpu::GpuMat::upload(const Mat& m) { CV_DbgAssert(!m.empty()); @@ -1492,9 +636,9 @@ void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double bet dst.create(size(), rtype); if (noScale) - gpuFuncTable()->convert(*psrc, dst); + cv::gpu::convertTo(*psrc, dst); else - gpuFuncTable()->convert(*psrc, dst, alpha, beta); + cv::gpu::convertTo(*psrc, dst, alpha, beta); } GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) @@ -1502,7 +646,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) CV_Assert(mask.empty() || mask.type() == CV_8UC1); CV_DbgAssert(!empty()); - gpuFuncTable()->setTo(*this, s, mask); + gpu::setTo(*this, s, mask); return *this; } @@ -1562,6 +706,39 @@ void cv::gpu::GpuMat::release() refcount = 0; } +namespace cv { namespace gpu +{ + void convertTo(const GpuMat& src, GpuMat& dst) + { + gpuFuncTable()->convert(src, dst); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) + { + gpuFuncTable()->convert(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } +}} + //////////////////////////////////////////////////////////////////////// // Error handling diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt new file mode 100644 index 0000000000..f67879ef91 --- /dev/null +++ b/modules/dynamicuda/CMakeLists.txt @@ -0,0 +1,15 @@ +if(NOT DYNAMIC_CUDA_SUPPORT) + ocv_module_disable(dynamicuda) +endif() + +set(the_description "Dynamic CUDA linkage") + +add_definitions(-DUSE_CUDA) +ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") +set(OPENCV_MODULE_TYPE SHARED) +if 
(BUILD_FAT_JAVA_LIB)
+  ocv_define_module(dynamicuda opencv_java PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+else()
+  ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
new file mode 100644
index 0000000000..8973c53049
--- /dev/null
+++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
@@ -0,0 +1,1112 @@
+#ifndef __GPUMAT_CUDA_HPP__
+#define __GPUMAT_CUDA_HPP__
+
+#ifndef HAVE_CUDA
+typedef void* cudaStream_t;
+#endif
+
+class DeviceInfoFuncTable
+{
+public:
+    // cv::DeviceInfo
+    virtual size_t sharedMemPerBlock() const = 0;
+    virtual void queryMemory(size_t&, size_t&) const = 0;
+    virtual size_t freeMemory() const = 0;
+    virtual size_t totalMemory() const = 0;
+    virtual bool supports(FeatureSet) const = 0;
+    virtual bool isCompatible() const = 0;
+    virtual void query() = 0;
+    virtual int deviceID() const = 0;
+    virtual std::string name() const = 0;
+    virtual int majorVersion() const = 0;
+    virtual int minorVersion() const = 0;
+    virtual int multiProcessorCount() const = 0;
+    virtual int getCudaEnabledDeviceCount() const = 0;
+    virtual void setDevice(int) const = 0;
+    virtual int getDevice() const = 0;
+    virtual void resetDevice() const = 0;
+    virtual bool deviceSupports(FeatureSet) const = 0;
+
+    // cv::TargetArchs
+    virtual bool builtWith(FeatureSet) const = 0;
+    virtual bool has(int, int) const = 0;
+    virtual bool hasPtx(int, int) const = 0;
+    virtual bool hasBin(int, int) const = 0;
+    virtual bool hasEqualOrLessPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreater(int, int) const = 0;
+    virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreaterBin(int, int) const = 0;
+
+    virtual void printCudaDeviceInfo(int) const = 0;
+    virtual void printShortCudaDeviceInfo(int) const = 0;
+
+    virtual ~DeviceInfoFuncTable() {};
+};
+
+class GpuFuncTable
+{
+public:
+    virtual ~GpuFuncTable() {}
+
+    // GpuMat routines
+    virtual void copy(const Mat& src, GpuMat& dst) const = 0;
+    virtual void copy(const GpuMat& src, Mat& dst) const = 0;
+    virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
+
+    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
+
+    // gpu::device::convertTo funcs
+    virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
+    virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
+
+    // for gpu::device::setTo funcs
+    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const = 0;
+
+    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
+    virtual void free(void* devPtr) const = 0;
+};
+
+class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
+{
+public:
+    size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
+    void queryMemory(size_t&, size_t&) const { throw_nogpu; }
+    size_t freeMemory() const { throw_nogpu; return 0; }
+    size_t totalMemory() const { throw_nogpu; return 0; }
+    bool supports(FeatureSet) const { throw_nogpu; return false; }
+    bool isCompatible() const { throw_nogpu; return false; }
+    void query() { throw_nogpu; }
+    int deviceID() const { throw_nogpu; return -1; };
+    std::string name() const { throw_nogpu; return std::string(); }
+    int majorVersion() const { throw_nogpu; return -1; }
+    int minorVersion() const { throw_nogpu; return -1; }
} + int multiProcessorCount() const { throw_nogpu; return -1; } + + int getCudaEnabledDeviceCount() const { return 0; } + + void setDevice(int) const { throw_nogpu; } + int getDevice() const { throw_nogpu; return 0; } + + void resetDevice() const { throw_nogpu; } + + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } + + bool builtWith(FeatureSet) const { throw_nogpu; return false; } + bool has(int, int) const { throw_nogpu; return false; } + bool hasPtx(int, int) const { throw_nogpu; return false; } + bool hasBin(int, int) const { throw_nogpu; return false; } + bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } + + void printCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } + + void printShortCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } +}; + +class EmptyFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat&, GpuMat&) const { throw_nogpu; } + void copy(const GpuMat&, Mat&) const { throw_nogpu; } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } + + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } + + void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } + void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } + + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const { throw_nogpu; } + + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } + void free(void*) const {} +}; + +#if defined(USE_CUDA) + +#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) +#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + +inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); +} + +inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") +{ + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } +} + +namespace cv { namespace gpu { namespace device +{ + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + + void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); +}}} + +template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); +} + +template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); +} + +template struct NPPTypeTraits; +template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; +template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; +template<> struct NPPTypeTraits { 
+#if defined(USE_CUDA)
+
+#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
+#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
+
+inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+{
+    if (cudaSuccess != err)
+        cv::gpu::error(cudaGetErrorString(err), file, line, func);
+}
+
+inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+{
+    if (err < 0)
+    {
+        std::ostringstream msg;
+        msg << "NPP API Call Error: " << err;
+        cv::gpu::error(msg.str().c_str(), file, line, func);
+    }
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
+
+    template <typename T>
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);
+
+    template <typename T>
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+
+    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
+}}}
+
+template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+{
+    Scalar_<T> sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
+}
+
+template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+{
+    Scalar_<T> sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+}
+
+template<int n> struct NPPTypeTraits;
+template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };
+template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s npp_type; };
+template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
+template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
+template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
+template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
+template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
+
+//////////////////////////////////////////////////////////////////////////
+// Convert
+
+template<int SDEPTH, int DDEPTH> struct NppConvertFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
+};
+template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
+{
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
+};
+
+template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    static void call(const GpuMat& src, GpuMat& dst)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
+{
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+    static void call(const GpuMat& src, GpuMat& dst)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Set
+
+template<int SDEPTH, int SCN> struct NppSetFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+template<int SCN> struct NppSetFunc<CV_8S, SCN>
+{
+    typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+template<> struct NppSetFunc<CV_8S, 1>
+{
+    typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+
+template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+template<int SDEPTH, int SCN> struct NppSetMaskFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+
+template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        Scalar_<src_t> nppS = s;
+
+        nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// CopyMasked
+
+template<int SDEPTH> struct NppCopyMaskedFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+
+template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+    static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+
+template <typename T> static inline bool isAligned(const T* ptr, size_t size)
+{
+    return reinterpret_cast<size_t>(ptr) % size == 0;
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0);
+    void convertTo(const GpuMat& src, GpuMat& dst);
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0);
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
+    void setTo(GpuMat& src, Scalar s);
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask);
+
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
+        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
+
+        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
+    }
+
+    void convertTo(const GpuMat& src, GpuMat& dst)
+    {
+        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
+    }
+
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
+    {
+        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
+
+        static const caller_t callers[] =
+        {
+            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
+            kernelSetCaller<float>, kernelSetCaller<double>
+        };
+
+        callers[src.depth()](src, s, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
+
+        static const caller_t callers[] =
+        {
+            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
+            kernelSetCaller<float>, kernelSetCaller<double>
+        };
+
+        callers[src.depth()](src, s, mask, stream);
+    }
+
+    void setTo(GpuMat& src, Scalar s)
+    {
+        setTo(src, s, 0);
+    }
+
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        setTo(src, s, mask, 0);
+    }
+}}}
+
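/* NOTE: the adapter templates above (NppCvt, NppSet, NppSetMask, NppCopyMasked)
   normalize NPP's per-type C entry points behind a uniform static call(), and
   kernelSetCaller<T> does the same for the custom kernels, so CudaFuncTable
   further down can dispatch through plain function-pointer tables indexed by
   matrix depth and channel count. A usage sketch, not part of the patch:

       GpuMat m(480, 640, CV_32FC1);
       cv::gpu::device::setTo(m, Scalar::all(1.0));   // -> kernelSetCaller<float>
*/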
+class CudaArch
+{
+public:
+    CudaArch()
+    {
+        fromStr(CUDA_ARCH_BIN, bin);
+        fromStr(CUDA_ARCH_PTX, ptx);
+        fromStr(CUDA_ARCH_FEATURES, features);
+    }
+
+    bool builtWith(FeatureSet feature_set) const
+    {
+        return !features.empty() && (features.back() >= feature_set);
+    }
+
+    bool hasPtx(int major, int minor) const
+    {
+        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
+    }
+
+    bool hasBin(int major, int minor) const
+    {
+        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
+    }
+
+    bool hasEqualOrLessPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+    }
+
+    bool hasEqualOrGreaterPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+    }
+
+    bool hasEqualOrGreaterBin(int major, int minor) const
+    {
+        return !bin.empty() && (bin.back() >= major * 10 + minor);
+    }
+
+
+private:
+    void fromStr(const string& set_as_str, vector<int>& arr)
+    {
+        if (set_as_str.find_first_not_of(" ") == string::npos)
+            return;
+
+        istringstream stream(set_as_str);
+        int cur_value;
+
+        while (!stream.eof())
+        {
+            stream >> cur_value;
+            arr.push_back(cur_value);
+        }
+
+        sort(arr.begin(), arr.end());
+    }
+
+    vector<int> bin;
+    vector<int> ptx;
+    vector<int> features;
+};
+
+class DeviceProps
+{
+public:
+    DeviceProps()
+    {
+        props_.resize(10, 0);
+    }
+
+    ~DeviceProps()
+    {
+        for (size_t i = 0; i < props_.size(); ++i)
+        {
+            if (props_[i])
+                delete props_[i];
+        }
+        props_.clear();
+    }
+
+    cudaDeviceProp* get(int devID)
+    {
+        if (devID >= (int) props_.size())
+            props_.resize(devID + 5, 0);
+
+        if (!props_[devID])
+        {
+            props_[devID] = new cudaDeviceProp;
+            cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
+        }
+
+        return props_[devID];
+    }
+private:
+    std::vector<cudaDeviceProp*> props_;
+};
+
+DeviceProps deviceProps;
+
+class CudaDeviceInfoFuncTable : public DeviceInfoFuncTable
+{
+public:
+    size_t sharedMemPerBlock() const
+    {
+        return deviceProps.get(device_id_)->sharedMemPerBlock;
+    }
+
+    void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
+    {
+        int prevDeviceID = getDevice();
+        if (prevDeviceID != device_id_)
+            setDevice(device_id_);
+
+        cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
+
+        if (prevDeviceID != device_id_)
+            setDevice(prevDeviceID);
+    }
+
+    size_t freeMemory() const
+    {
+        size_t _totalMemory, _freeMemory;
+        queryMemory(_totalMemory, _freeMemory);
+        return _freeMemory;
+    }
+
+    size_t totalMemory() const
+    {
+        size_t _totalMemory, _freeMemory;
+        queryMemory(_totalMemory, _freeMemory);
+        return _totalMemory;
+    }
+
+    bool supports(FeatureSet feature_set) const
+    {
+        int version = majorVersion_ * 10 + minorVersion_;
+        return version >= feature_set;
+    }
+
+    bool isCompatible() const
+    {
+        // Check PTX compatibility
+        if (hasEqualOrLessPtx(majorVersion_, minorVersion_))
+            return true;
+
+        // Check BIN compatibility
+        for (int i = minorVersion_; i >= 0; --i)
+            if (hasBin(majorVersion_, i))
+                return true;
+
+        return false;
+    }
+
+    void query()
+    {
+        const cudaDeviceProp* prop = deviceProps.get(device_id_);
+
+        name_ = prop->name;
+        multi_processor_count_ =
prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + int deviceID() const + { + return device_id_; + } + + std::string name() const + { + return name_; + } + + int majorVersion() const + { + return majorVersion_; + } + + int minorVersion() const + { + return minorVersion_; + } + + int multiProcessorCount() const + { + return multi_processor_count_; + } + + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + + if (error == cudaErrorInsufficientDriver) + return -1; + + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const + { + cudaSafeCall( cudaSetDevice( device ) ); + } + + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool hasEqualOrLessPtx(int major, int minor) const + { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } + + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + +private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } +}; + +class CudaFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || 
!DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } + }; + + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; + + func(src, dst, mask, 0); + } + + void convert(const GpuMat& src, GpuMat& dst) const + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst); + static const func_t funcs[7][7][4] = + { + { + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32F */ 
{NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 64F */ {0,0,0,0} + } + }; + + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); 
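/* NOTE: convert() is table-driven: funcs[src.depth()][dst.depth()][channels - 1]
   mixes NPP fast paths (NppCvt<...>::call) with the generic templated kernel
   cv::gpu::device::convertTo, and null entries mark same-depth pairs that never
   reach the lookup. The dispatch itself reduces to:

       const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
       CV_DbgAssert(func != 0);
       func(src, dst);

   NPP entry points require 16-byte-aligned data, hence the isAligned() guard
   below that falls back to the generic kernel. */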
+ CV_Assert(dst.depth() <= CV_64F); + CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) + { + cv::gpu::device::convertTo(src, dst); + return; + } + + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert(func != 0); + + func(src, dst); + } + + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const + { + if (mask.empty()) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + { + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + return; + } + + if (m.depth() == CV_8U) + { + int cn = m.channels(); + + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) + { + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); + return; + } + } + + typedef void (*func_t)(GpuMat& src, Scalar s); + static const func_t funcs[7][4] = + { + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + if (stream) + cv::gpu::device::setTo(m, s, stream); + else + funcs[m.depth()][m.channels() - 1](m, s); + } + else + { + typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); + static const func_t funcs[7][4] = + { + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo }, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo } + }; + + 
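/* NOTE: mirroring the unmasked branch above, a non-null stream is routed to the
   asynchronous templated kernel (the masked cv::gpu::device::setTo), while the
   null-stream path may dispatch into the NPP table. Every NPP wrapper in this
   header ends with cudaDeviceSynchronize(), so the NPP paths are deliberately
   blocking. */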
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+
+            if (stream)
+                cv::gpu::device::setTo(m, s, mask, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s, mask);
+        }
+    }
+
+    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
+    {
+        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
+    }
+
+    void free(void* devPtr) const
+    {
+        cudaFree(devPtr);
+    }
+};
+#endif
+#endif
diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/dynamicuda/src/cuda/matrix_operations.cu
similarity index 100%
rename from modules/core/src/cuda/matrix_operations.cu
rename to modules/dynamicuda/src/cuda/matrix_operations.cu
diff --git a/modules/dynamicuda/src/main.cpp b/modules/dynamicuda/src/main.cpp
new file mode 100644
index 0000000000..8eb66fd98d
--- /dev/null
+++ b/modules/dynamicuda/src/main.cpp
@@ -0,0 +1,55 @@
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+
+#include
+#include
+
+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#include <npp.h>
+
+#define CUDART_MINIMUM_REQUIRED_VERSION 4020
+#define NPP_MINIMUM_REQUIRED_VERSION 4200
+
+#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient Cuda Runtime library version, please update it."
+#endif
+
+#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient NPP version, please update it."
+#endif
+#endif
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
+
+#include "opencv2/dynamicuda/dynamicuda.hpp"
+
+#ifdef HAVE_CUDA
+static CudaDeviceInfoFuncTable deviceInfoTable;
+static CudaFuncTable gpuTable;
+#else
+static EmptyDeviceInfoFuncTable deviceInfoTable;
+static EmptyFuncTable gpuTable;
+#endif
+
+extern "C" {
+
+DeviceInfoFuncTable* deviceInfoFactory();
+GpuFuncTable* gpuFactory();
+
+DeviceInfoFuncTable* deviceInfoFactory()
+{
+    return (DeviceInfoFuncTable*)&deviceInfoTable;
+}
+
+GpuFuncTable* gpuFactory()
+{
+    return (GpuFuncTable*)&gpuTable;
+}
+
+}
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index a616597894..9171febc74 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -3,7 +3,8 @@ if(IOS)
 endif()
 
 set(the_description "GPU-accelerated Computer Vision")
-ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)
+ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy
+    OPTIONAL ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
 
 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
diff --git a/modules/gpu/perf4au/CMakeLists.txt b/modules/gpu/perf4au/CMakeLists.txt
index 376e7b2706..13efe7ffa3 100644
--- a/modules/gpu/perf4au/CMakeLists.txt
+++ b/modules/gpu/perf4au/CMakeLists.txt
@@ -2,26 +2,28 @@ set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video
 ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS})
 
-set(the_target gpu_perf4au)
-project(${the_target})
+if (OCV_DEPENDENCIES_FOUND)
+  set(the_target gpu_perf4au)
+  project(${the_target})
 
-ocv_include_modules(${PERF4AU_REQUIRED_DEPS})
+  ocv_include_modules(${PERF4AU_REQUIRED_DEPS})
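# NOTE: the net effect of this patch is that opencv_dynamicuda carries the
# CUDA/NPP link dependencies, while opencv_core stays free of them whenever
# ENABLE_DYNAMIC_CUDA is ON, as in the Android configuration targeted here.
# A typical configure line, as a sketch -- the toolchain path and options
# depend on the local checkout:
#
#   cmake -DCMAKE_TOOLCHAIN_FILE=platforms/android/android.toolchain.cmake \
#         -DWITH_CUDA=ON -DENABLE_DYNAMIC_CUDA=ON <opencv_source_dir>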
-if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) + if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") -endif() + endif() -file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp) -add_executable(${the_target} ${srcs}) + file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp) + add_executable(${the_target} ${srcs}) -target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS}) + target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS}) -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "tests performance") -endif() + if(ENABLE_SOLUTION_FOLDERS) + set_target_properties(${the_target} PROPERTIES FOLDER "tests performance") + endif() -if(WIN32) + if(WIN32) if(MSVC AND NOT BUILD_SHARED_LIBS) - set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG") + set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG") endif() -endif() + endif() +endif() \ No newline at end of file diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt index 5012f914c7..3a6ebe8362 100644 --- a/modules/java/CMakeLists.txt +++ b/modules/java/CMakeLists.txt @@ -297,6 +297,12 @@ if(BUILD_FAT_JAVA_LIB) list(REMOVE_ITEM __deps ${m}) endif() endforeach() + if (ENABLE_DYNAMIC_CUDA) + list(REMOVE_ITEM __deps "opencv_dynamicuda") + endif() + if (ANDROID AND HAVE_opencv_gpu) + list(REMOVE_ITEM __deps "opencv_gpu") + endif() ocv_list_unique(__deps) set(__extradeps ${__deps}) ocv_list_filterout(__extradeps "^opencv_") diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt index fda44591f7..6e9a35ba73 100644 --- a/modules/stitching/CMakeLists.txt +++ b/modules/stitching/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Images stitching") -ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +if (ENABLE_DYNAMIC_CUDA) + ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_nonfree) +else() + ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +endif() \ No newline at end of file diff --git a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp index 09a1a106fd..9301dc5ebe 100644 --- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp @@ -227,7 +227,7 @@ private: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS GraphCutSeamFinderGpu : public GraphCutSeamFinderBase, public PairwiseSeamFinder { public: diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp index 2bd46f75a9..d44bfe69eb 100644 --- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp @@ -46,7 +46,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/opencv_modules.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) # include "opencv2/gpu/gpu.hpp" #endif @@ -331,7 +331,7 @@ 
public: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS PlaneWarperGpu : public PlaneWarper { public: diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp index 7475d1304a..87efa7e80a 100644 --- a/modules/stitching/include/opencv2/stitching/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/warpers.hpp @@ -145,7 +145,7 @@ public: -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class PlaneWarperGpu: public WarperCreator { public: diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp index e65023a55d..fb3c0d666b 100644 --- a/modules/stitching/src/blenders.cpp +++ b/modules/stitching/src/blenders.cpp @@ -189,7 +189,7 @@ Rect FeatherBlender::createWeightMaps(const vector &masks, const vector &pyr) void createLaplacePyrGpu(const Mat &img, int num_levels, vector &pyr) { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) pyr.resize(num_levels + 1); vector gpu_pyr(num_levels + 1); @@ -531,7 +531,7 @@ void restoreImageFromLaplacePyr(vector &pyr) void restoreImageFromLaplacePyrGpu(vector &pyr) { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (pyr.empty()) return; diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp index d918cfff29..d86206233f 100644 --- a/modules/stitching/src/matchers.cpp +++ b/modules/stitching/src/matchers.cpp @@ -46,7 +46,7 @@ using namespace std; using namespace cv; using namespace cv::detail; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) using namespace cv::gpu; #endif @@ -129,7 +129,7 @@ private: float match_conf_; }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class GpuMatcher : public FeaturesMatcher { public: @@ -204,7 +204,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat LOG("1->2 & 2->1 matches: " << matches_info.matches.size() << endl); } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo& matches_info) { matches_info.matches.clear(); @@ -432,7 +432,7 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features) } } -#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) +#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID) SurfFeaturesFinderGpu::SurfFeaturesFinderGpu(double hess_thresh, int num_octaves, int num_layers, int num_octaves_descr, int num_layers_descr) { @@ -533,7 +533,7 @@ void FeaturesMatcher::operator ()(const vector &features, vector< BestOf2NearestMatcher::BestOf2NearestMatcher(bool try_use_gpu, float match_conf, int num_matches_thresh1, int num_matches_thresh2) { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_use_gpu && getCudaEnabledDeviceCount() > 0) impl_ = new GpuMatcher(match_conf); else diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp index 1050856d31..54b6721437 100644 --- a/modules/stitching/src/precomp.hpp +++ b/modules/stitching/src/precomp.hpp @@ -68,7 +68,7 @@ #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/features2d/features2d.hpp" #include "opencv2/calib3d/calib3d.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) #include "opencv2/gpu/gpu.hpp" #ifdef HAVE_OPENCV_NONFREE diff --git 
a/modules/stitching/src/seam_finders.cpp b/modules/stitching/src/seam_finders.cpp index 784209c935..a198c1ebb4 100644 --- a/modules/stitching/src/seam_finders.cpp +++ b/modules/stitching/src/seam_finders.cpp @@ -1318,7 +1318,7 @@ void GraphCutSeamFinder::find(const vector &src, const vector &corne } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) void GraphCutSeamFinderGpu::find(const vector &src, const vector &corners, vector &masks) { diff --git a/modules/stitching/src/stitcher.cpp b/modules/stitching/src/stitcher.cpp index 5da26f6dbf..4a36ab0a45 100644 --- a/modules/stitching/src/stitcher.cpp +++ b/modules/stitching/src/stitcher.cpp @@ -58,7 +58,7 @@ Stitcher Stitcher::createDefault(bool try_use_gpu) stitcher.setFeaturesMatcher(new detail::BestOf2NearestMatcher(try_use_gpu)); stitcher.setBundleAdjuster(new detail::BundleAdjusterRay()); -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_use_gpu && gpu::getCudaEnabledDeviceCount() > 0) { #if defined(HAVE_OPENCV_NONFREE) diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp index 932958c6f7..935831950f 100644 --- a/modules/stitching/src/warpers.cpp +++ b/modules/stitching/src/warpers.cpp @@ -212,7 +212,7 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, gpu::GpuMat &xmap, gpu::GpuMat &ymap) { return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap); diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt index 44e9dc0f3b..3da8dc2c6e 100644 --- a/modules/superres/CMakeLists.txt +++ b/modules/superres/CMakeLists.txt @@ -4,4 +4,4 @@ endif() set(the_description "Super Resolution") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef) -ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl) +ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) diff --git a/modules/videostab/CMakeLists.txt b/modules/videostab/CMakeLists.txt index ac5cb0d69b..84ec1d2e8d 100644 --- a/modules/videostab/CMakeLists.txt +++ b/modules/videostab/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Video stabilization") -ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +if(ENABLE_DYNAMIC_CUDA) + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui) +else() + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +endif() diff --git a/modules/videostab/include/opencv2/videostab/optical_flow.hpp b/modules/videostab/include/opencv2/videostab/optical_flow.hpp index 18b7d3f283..2c1742fc79 100644 --- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp +++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp @@ -46,7 +46,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/opencv_modules.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) # include "opencv2/gpu/gpu.hpp" #endif @@ -98,7 +98,7 @@ public: OutputArray status, OutputArray errors); }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS 
DensePyrLkOptFlowEstimatorGpu : public PyrLkOptFlowEstimatorBase, public IDenseOptFlowEstimator { diff --git a/modules/videostab/src/inpainting.cpp b/modules/videostab/src/inpainting.cpp index 4377c007c8..c6568e071e 100644 --- a/modules/videostab/src/inpainting.cpp +++ b/modules/videostab/src/inpainting.cpp @@ -323,7 +323,7 @@ public: MotionInpainter::MotionInpainter() { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) setOptFlowEstimator(new DensePyrLkOptFlowEstimatorGpu()); #else CV_Error(CV_StsNotImplemented, "Current implementation of MotionInpainter requires GPU"); diff --git a/modules/videostab/src/optical_flow.cpp b/modules/videostab/src/optical_flow.cpp index 46100fdb59..3441df1683 100644 --- a/modules/videostab/src/optical_flow.cpp +++ b/modules/videostab/src/optical_flow.cpp @@ -59,7 +59,7 @@ void SparsePyrLkOptFlowEstimator::run( } -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu() { CV_Assert(gpu::getCudaEnabledDeviceCount() > 0); diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp index 49d86086de..7394a72821 100644 --- a/samples/cpp/stitching_detailed.cpp +++ b/samples/cpp/stitching_detailed.cpp @@ -355,7 +355,7 @@ int main(int argc, char* argv[]) Ptr finder; if (features_type == "surf") { -#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) +#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) finder = new SurfFeaturesFinderGpu(); else @@ -543,7 +543,7 @@ int main(int argc, char* argv[]) // Warp images and their masks Ptr warper_creator; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) { if (warp_type == "plane") warper_creator = new cv::PlaneWarperGpu(); @@ -608,7 +608,7 @@ int main(int argc, char* argv[]) seam_finder = new detail::VoronoiSeamFinder(); else if (seam_find_type == "gc_color") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR); else @@ -617,7 +617,7 @@ int main(int argc, char* argv[]) } else if (seam_find_type == "gc_colorgrad") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR_GRAD); else