diff --git a/CMakeLists.txt b/CMakeLists.txt index 23b79ec2b6..da82a9d908 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -708,47 +708,36 @@ if(WITH_CUDA) message(STATUS "CUDA detected: " ${CUDA_VERSION}) set(CUDA_ARCH_GPU "1.3 2.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for") - set(CUDA_ARCH_PTX "1.1 1.3" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + set(CUDA_ARCH_PTX "1.1 1.3" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") - # Architectures to be searched for in user's input - set (CUDA_ARCH_ALL 1.0 1.1 1.2 1.3 2.0 2.1) + # These variables are used in config templates + string(REGEX REPLACE "\\." "" ARCH_GPU_NO_POINTS "${CUDA_ARCH_GPU}") + string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}") - # Parse user's input - foreach(ARCH IN LISTS CUDA_ARCH_ALL) - string(REGEX MATCH ${ARCH} ARCH_GPU_MATCH "${CUDA_ARCH_GPU}") - string(REGEX MATCH ${ARCH} ARCH_PTX_MATCH "${CUDA_ARCH_PTX}") - string(REGEX REPLACE "\\." "" ARCH_GPU_AS_NUM "${ARCH_GPU_MATCH}") - string(REGEX REPLACE "\\." "" ARCH_PTX_AS_NUM "${ARCH_PTX_MATCH}") - - # Define variables indicating the architectures specified by user - if(NOT ${ARCH_GPU_AS_NUM} STREQUAL "") - set(OPENCV_ARCH_GPU_${ARCH_GPU_AS_NUM} 1) - endif() - if(NOT ${ARCH_PTX_AS_NUM} STREQUAL "") - set(OPENCV_ARCH_PTX_${ARCH_PTX_AS_NUM} 1) - endif() - endforeach() + # Check if user specified 1.0 compute capability (the dot is escaped so it only matches a literal '.') + string(REGEX MATCH "1\\.0" HAS_ARCH_10 "${CUDA_ARCH_GPU} ${CUDA_ARCH_PTX}") + if(NOT "${HAS_ARCH_10}" STREQUAL "") + set(OPENCV_ARCH_GPU_OR_PTX_10 1) + endif() set(NVCC_FLAGS_EXTRA "") # Tell nvcc to add binaries for the specified GPUs - string(REGEX REPLACE "\\."
"" CUDA_ARCH_GPU "${CUDA_ARCH_GPU}") - string(REGEX MATCHALL "[0-9]+" CUDA_ARCH_GPU_LIST "${CUDA_ARCH_GPU}") - foreach(ARCH_GPU IN LISTS CUDA_ARCH_GPU_LIST) - set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH_GPU},code=sm_${ARCH_GPU}) + string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_GPU_NO_POINTS}") + foreach(ARCH IN LISTS ARCH_LIST) + set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=sm_${ARCH}) endforeach() # Tell nvcc to add PTX intermediate code for the specified architectures - string(REGEX REPLACE "\\." "" CUDA_ARCH_PTX "${CUDA_ARCH_PTX}") - string(REGEX MATCHALL "[0-9]+" CUDA_ARCH_PTX_LIST "${CUDA_ARCH_PTX}") - foreach(ARCH_PTX IN LISTS CUDA_ARCH_PTX_LIST) - set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH_PTX},code=compute_${ARCH_PTX}) + string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}") + foreach(ARCH IN LISTS ARCH_LIST) + set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH}) endforeach() - - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA}) - message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA}) set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}") + + message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}") endif() endif() diff --git a/cvconfig.h.cmake b/cvconfig.h.cmake index 053fde1e24..27681a3b2e 100644 --- a/cvconfig.h.cmake +++ b/cvconfig.h.cmake @@ -163,21 +163,14 @@ /* NVidia Cuda Runtime API*/ #cmakedefine HAVE_CUDA -/* Compile for 'real' NVIDIA GPU architecture */ -#cmakedefine OPENCV_ARCH_GPU_10 -#cmakedefine OPENCV_ARCH_GPU_11 -#cmakedefine OPENCV_ARCH_GPU_12 -#cmakedefine OPENCV_ARCH_GPU_13 -#cmakedefine OPENCV_ARCH_GPU_20 -#cmakedefine OPENCV_ARCH_GPU_21 - -/* Compile for 'virtual' NVIDIA PTX architecture */ -#cmakedefine OPENCV_ARCH_PTX_10 -#cmakedefine OPENCV_ARCH_PTX_11 -#cmakedefine OPENCV_ARCH_PTX_12 -#cmakedefine OPENCV_ARCH_PTX_13 -#cmakedefine 
OPENCV_ARCH_PTX_20 -#cmakedefine OPENCV_ARCH_PTX_21 +/* Compile for 'real' NVIDIA GPU architectures */ +#define OPENCV_ARCH_GPU "${ARCH_GPU_NO_POINTS}" + +/* Compile for 'virtual' NVIDIA PTX architectures */ +#define OPENCV_ARCH_PTX "${ARCH_PTX_NO_POINTS}" + +/* Create PTX or CUBIN for 1.0 compute capability */ +#cmakedefine OPENCV_ARCH_GPU_OR_PTX_10 /* VideoInput library */ #cmakedefine HAVE_VIDEOINPUT diff --git a/doc/gpu_image_processing.tex b/doc/gpu_image_processing.tex index 38e326b3eb..fe6a69ee8b 100644 --- a/doc/gpu_image_processing.tex +++ b/doc/gpu_image_processing.tex @@ -232,10 +232,10 @@ private: \cvCppFunc{gpu::ConvolveBuf::ConvolveBuf} -\cvdefCpp{ConvolveBuf();} +\cvdefCpp{ConvolveBuf::ConvolveBuf();} Constructs an empty buffer which will be properly resized after first call of the convolve function. -\cvdefCpp{ConvolveBuf(Size image\_size, Size templ\_size);} +\cvdefCpp{ConvolveBuf::ConvolveBuf(Size image\_size, Size templ\_size);} Constructs a buffer for the convolve function with respectively arguments. diff --git a/doc/gpu_object_detection.tex b/doc/gpu_object_detection.tex index e5cfb18cf7..46cca72fee 100644 --- a/doc/gpu_object_detection.tex +++ b/doc/gpu_object_detection.tex @@ -82,13 +82,13 @@ Creates HOG descriptor and detector. \cvCppFunc{gpu::HOGDescriptor::getDescriptorSize} Returns number of coefficients required for the classification. -\cvdefCpp{size\_t getDescriptorSize() const;} +\cvdefCpp{size\_t HOGDescriptor::getDescriptorSize() const;} \cvCppFunc{gpu::HOGDescriptor::getBlockHistogramSize} Returns block histogram size. -\cvdefCpp{size\_t getBlockHistogramSize() const;} +\cvdefCpp{size\_t HOGDescriptor::getBlockHistogramSize() const;} \cvCppFunc{gpu::HOGDescriptor::setSVMDetector} @@ -100,25 +100,25 @@ Sets coefficients for the linear SVM classifier. \cvCppFunc{gpu::HOGDescriptor::getDefaultPeopleDetector} Returns coefficients of the classifier trained for people detection (for default window size). 
-\cvdefCpp{static vector getDefaultPeopleDetector();} +\cvdefCpp{static vector HOGDescriptor::getDefaultPeopleDetector();} \cvCppFunc{gpu::HOGDescriptor::getPeopleDetector48x96} Returns coefficients of the classifier trained for people detection (for 48x96 windows). -\cvdefCpp{static vector getPeopleDetector48x96();} +\cvdefCpp{static vector HOGDescriptor::getPeopleDetector48x96();} \cvCppFunc{gpu::HOGDescriptor::getPeopleDetector64x128} Returns coefficients of the classifier trained for people detection (for 64x128 windows). -\cvdefCpp{static vector getPeopleDetector64x128();} +\cvdefCpp{static vector HOGDescriptor::getPeopleDetector64x128();} \cvCppFunc{gpu::HOGDescriptor::detect} Perfroms object detection without multiscale window. -\cvdefCpp{void detect(const GpuMat\& img, vector\& found\_locations,\par +\cvdefCpp{void HOGDescriptor::detect(const GpuMat\& img, vector\& found\_locations,\par double hit\_threshold=0, Size win\_stride=Size(),\par Size padding=Size());} @@ -134,10 +134,10 @@ Perfroms object detection without multiscale window. \cvCppFunc{gpu::HOGDescriptor::detectMultiScale} Perfroms object detection with multiscale window. -\cvdefCpp{void detectMultiScale(const GpuMat\& img, vector\& found\_locations,\par - double hit\_threshold=0, Size win\_stride=Size(),\par - Size padding=Size(), double scale0=1.05,\par - int group\_threshold=2);} +\cvdefCpp{void HOGDescriptor::detectMultiScale(const GpuMat\& img,\par + vector\& found\_locations, double hit\_threshold=0,\par + Size win\_stride=Size(), Size padding=Size(),\par + double scale0=1.05, int group\_threshold=2);} \begin{description} \cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.} @@ -154,9 +154,9 @@ See \cvCppCross{groupRectangles}.} \cvCppFunc{gpu::HOGDescriptor::getDescriptors} Returns block descriptors computed for the whole image. It's mainly used for classifier learning purposes. 
-\cvdefCpp{void getDescriptors(const GpuMat\& img, Size win\_stride,\par - GpuMat\& descriptors,\par - int descr\_format=DESCR\_FORMAT\_COL\_BY\_COL);} +\cvdefCpp{void HOGDescriptor::getDescriptors(const GpuMat\& img,\par + Size win\_stride, GpuMat\& descriptors,\par + int descr\_format=DESCR\_FORMAT\_COL\_BY\_COL);} \begin{description} \cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.} diff --git a/modules/gpu/src/initialization.cpp b/modules/gpu/src/initialization.cpp index 10ee3b5b9d..d754c87364 100644 --- a/modules/gpu/src/initialization.cpp +++ b/modules/gpu/src/initialization.cpp @@ -41,6 +41,7 @@ //M*/ #include "precomp.hpp" +#include using namespace cv; using namespace cv::gpu; @@ -58,12 +59,12 @@ CV_EXPORTS void cv::gpu::getGpuMemInfo(size_t& /*free*/, size_t& /*total*/) { t CV_EXPORTS bool cv::gpu::hasNativeDoubleSupport(int /*device*/) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int /*device*/) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasPtxVersion(int major, int minor) { throw_nogpu(); return false; } -CV_EXPORTS bool cv::gpu::hasLessOrEqualPtxVersion(int major, int minor) { throw_nogpu(); return false; } -CV_EXPORTS bool cv::gpu::hasGreaterOrEqualPtxVersion(int major, int minor) { throw_nogpu(); return false; } -CV_EXPORTS bool cv::gpu::hasCubinVersion(int major, int minor) { throw_nogpu(); return false; } -CV_EXPORTS bool cv::gpu::hasGreaterOrEqualCubinVersion(int major, int minor) { throw_nogpu(); return false; } -CV_EXPORTS bool cv::gpu::hasVersion(int major, int minor) { throw_nogpu(); return false; } -CV_EXPORTS bool cv::gpu::hasGreaterOrEqualVersion(int major, int minor) { throw_nogpu(); return false; } +CV_EXPORTS bool cv::gpu::hasLessOrEqualPtxVersion(int major, int minor) { return false; } +CV_EXPORTS bool cv::gpu::hasGreaterOrEqualPtxVersion(int major, int minor) { return false; } +CV_EXPORTS bool cv::gpu::hasCubinVersion(int major, int minor) { return 
false; } +CV_EXPORTS bool cv::gpu::hasGreaterOrEqualCubinVersion(int major, int minor) { return false; } +CV_EXPORTS bool cv::gpu::hasVersion(int major, int minor) { return false; } +CV_EXPORTS bool cv::gpu::hasGreaterOrEqualVersion(int major, int minor) { return false; } CV_EXPORTS bool cv::gpu::isCompatibleWith(int device) { throw_nogpu(); return false; } @@ -142,118 +143,55 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device) namespace { template - bool checkPtxVersion(int major, int minor, Comparer cmp) + bool compare(const std::string& str, int x, Comparer cmp) /* true if cmp(v, x) holds for any integer v parsed from str */ { -#ifdef OPENCV_ARCH_PTX_10 - if (cmp(1, 0, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_PTX_11 - if (cmp(1, 1, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_PTX_12 - if (cmp(1, 2, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_PTX_13 - if (cmp(1, 3, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_PTX_20 - if (cmp(2, 0, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_PTX_21 - if (cmp(2, 1, major, minor)) return true; -#endif - - return false; - } - - template - bool checkCubinVersion(int major, int minor, Comparer cmp) - { -#ifdef OPENCV_ARCH_GPU_10 - if (cmp(1, 0, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_GPU_11 - if (cmp(1, 1, major, minor)) return true; -#endif + std::stringstream stream(str); -#ifdef OPENCV_ARCH_GPU_12 - if (cmp(1, 2, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_GPU_13 - if (cmp(1, 3, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_GPU_20 - if (cmp(2, 0, major, minor)) return true; -#endif - -#ifdef OPENCV_ARCH_GPU_21 - if (cmp(2, 1, major, minor)) return true; -#endif - - return false; - } + int val; + stream >> val; - struct ComparerEqual - { - bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const - { - return lhs1 == rhs1 && lhs2 == rhs2; - } - }; - - struct ComparerLessOrEqual - { - bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
+ while (!stream.fail()) /* eofbit is set by the read that consumes the last number, so only failbit may end the loop */ { - return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2); + if (cmp(val, x)) + return true; + stream >> val; } - }; - struct ComparerGreaterOrEqual - { - bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const - { - return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2); - } - }; + return false; + } } CV_EXPORTS bool cv::gpu::hasPtxVersion(int major, int minor) { - return checkPtxVersion(major, minor, ComparerEqual()); + return ::compare(OPENCV_ARCH_PTX, major * 10 + minor, std::equal_to()); } CV_EXPORTS bool cv::gpu::hasLessOrEqualPtxVersion(int major, int minor) { - return checkPtxVersion(major, minor, ComparerLessOrEqual()); + return ::compare(OPENCV_ARCH_PTX, major * 10 + minor, + std::less_equal()); } CV_EXPORTS bool cv::gpu::hasGreaterOrEqualPtxVersion(int major, int minor) { - return checkPtxVersion(major, minor, ComparerGreaterOrEqual()); + return ::compare(OPENCV_ARCH_PTX, major * 10 + minor, + std::greater_equal()); } CV_EXPORTS bool cv::gpu::hasCubinVersion(int major, int minor) { - return checkCubinVersion(major, minor, ComparerEqual()); + return ::compare(OPENCV_ARCH_GPU, major * 10 + minor, std::equal_to()); } CV_EXPORTS bool cv::gpu::hasGreaterOrEqualCubinVersion(int major, int minor) { - return checkCubinVersion(major, minor, ComparerGreaterOrEqual()); + return ::compare(OPENCV_ARCH_GPU, major * 10 + minor, + std::greater_equal()); } @@ -284,7 +222,7 @@ CV_EXPORTS bool cv::gpu::isCompatibleWith(int device) return true; // Check CUBIN compatibilty - for (int i = 0; i <= minor; ++i) + for (int i = minor; i >= 0; --i) if (hasCubinVersion(major, i)) return true; diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index a7ba6ffaba..03acb4a2bd 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -85,6 +85,10 @@ #error "Insufficient NPP version, please update it."
#endif +#if defined(OPENCV_ARCH_GPU_OR_PTX_10) + #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0" +#endif + static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); } #else /* defined(HAVE_CUDA) */