updated gpu initialization functions, added compile-time error on CC 1.0

pull/13383/head
Alexey Spizhevoy 14 years ago
parent 6187b97199
commit 574b3f94a1
  1. 47
      CMakeLists.txt
  2. 23
      cvconfig.h.cmake
  3. 4
      doc/gpu_image_processing.tex
  4. 26
      doc/gpu_object_detection.tex
  5. 114
      modules/gpu/src/initialization.cpp
  6. 4
      modules/gpu/src/precomp.hpp

@ -708,47 +708,36 @@ if(WITH_CUDA)
message(STATUS "CUDA detected: " ${CUDA_VERSION}) message(STATUS "CUDA detected: " ${CUDA_VERSION})
set(CUDA_ARCH_GPU "1.3 2.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for") set(CUDA_ARCH_GPU "1.3 2.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for")
set(CUDA_ARCH_PTX "1.1 1.3" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") set(CUDA_ARCH_PTX "1.1 1.3" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
# Architectures to be searched for in user's input # These variables are used in config templates
set (CUDA_ARCH_ALL 1.0 1.1 1.2 1.3 2.0 2.1) string(REGEX REPLACE "\\." "" ARCH_GPU_NO_POINTS "${CUDA_ARCH_GPU}")
string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
# Parse user's input # Check if user specified 1.0 compute capability
foreach(ARCH IN LISTS CUDA_ARCH_ALL) string(REGEX MATCH "1.0" HAS_ARCH_10 "${CUDA_ARCH_GPU} ${CUDA_ARCH_PTX}")
string(REGEX MATCH ${ARCH} ARCH_GPU_MATCH "${CUDA_ARCH_GPU}") if(NOT ${HAS_ARCH_10} STREQUAL "")
string(REGEX MATCH ${ARCH} ARCH_PTX_MATCH "${CUDA_ARCH_PTX}") set(OPENCV_ARCH_GPU_OR_PTX_10 1)
string(REGEX REPLACE "\\." "" ARCH_GPU_AS_NUM "${ARCH_GPU_MATCH}") endif()
string(REGEX REPLACE "\\." "" ARCH_PTX_AS_NUM "${ARCH_PTX_MATCH}")
# Define variables indicating the architectures specified by user
if(NOT ${ARCH_GPU_AS_NUM} STREQUAL "")
set(OPENCV_ARCH_GPU_${ARCH_GPU_AS_NUM} 1)
endif()
if(NOT ${ARCH_PTX_AS_NUM} STREQUAL "")
set(OPENCV_ARCH_PTX_${ARCH_PTX_AS_NUM} 1)
endif()
endforeach()
set(NVCC_FLAGS_EXTRA "") set(NVCC_FLAGS_EXTRA "")
# Tell nvcc to add binaries for the specified GPUs # Tell nvcc to add binaries for the specified GPUs
string(REGEX REPLACE "\\." "" CUDA_ARCH_GPU "${CUDA_ARCH_GPU}") string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_GPU_NO_POINTS}")
string(REGEX MATCHALL "[0-9]+" CUDA_ARCH_GPU_LIST "${CUDA_ARCH_GPU}") foreach(ARCH IN LISTS ARCH_LIST)
foreach(ARCH_GPU IN LISTS CUDA_ARCH_GPU_LIST) set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=sm_${ARCH})
set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH_GPU},code=sm_${ARCH_GPU})
endforeach() endforeach()
# Tell nvcc to add PTX intermediate code for the specified architectures # Tell nvcc to add PTX intermediate code for the specified architectures
string(REGEX REPLACE "\\." "" CUDA_ARCH_PTX "${CUDA_ARCH_PTX}") string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")
string(REGEX MATCHALL "[0-9]+" CUDA_ARCH_PTX_LIST "${CUDA_ARCH_PTX}") foreach(ARCH IN LISTS ARCH_LIST)
foreach(ARCH_PTX IN LISTS CUDA_ARCH_PTX_LIST) set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH})
set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH_PTX},code=compute_${ARCH_PTX})
endforeach() endforeach()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}") set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}")
message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}")
endif() endif()
endif() endif()

@ -163,21 +163,14 @@
/* NVidia Cuda Runtime API*/ /* NVidia Cuda Runtime API*/
#cmakedefine HAVE_CUDA #cmakedefine HAVE_CUDA
/* Compile for 'real' NVIDIA GPU architecture */ /* Compile for 'real' NVIDIA GPU architectures */
#cmakedefine OPENCV_ARCH_GPU_10 #define OPENCV_ARCH_GPU "${ARCH_GPU_NO_POINTS}"
#cmakedefine OPENCV_ARCH_GPU_11
#cmakedefine OPENCV_ARCH_GPU_12 /* Compile for 'virtual' NVIDIA PTX architectures */
#cmakedefine OPENCV_ARCH_GPU_13 #define OPENCV_ARCH_PTX "${ARCH_PTX_NO_POINTS}"
#cmakedefine OPENCV_ARCH_GPU_20
#cmakedefine OPENCV_ARCH_GPU_21 /* Create PTX or CUBIN for 1.0 compute capability */
#cmakedefine OPENCV_ARCH_GPU_OR_PTX_10
/* Compile for 'virtual' NVIDIA PTX architecture */
#cmakedefine OPENCV_ARCH_PTX_10
#cmakedefine OPENCV_ARCH_PTX_11
#cmakedefine OPENCV_ARCH_PTX_12
#cmakedefine OPENCV_ARCH_PTX_13
#cmakedefine OPENCV_ARCH_PTX_20
#cmakedefine OPENCV_ARCH_PTX_21
/* VideoInput library */ /* VideoInput library */
#cmakedefine HAVE_VIDEOINPUT #cmakedefine HAVE_VIDEOINPUT

@ -232,10 +232,10 @@ private:
\cvCppFunc{gpu::ConvolveBuf::ConvolveBuf} \cvCppFunc{gpu::ConvolveBuf::ConvolveBuf}
\cvdefCpp{ConvolveBuf();} \cvdefCpp{ConvolveBuf::ConvolveBuf();}
Constructs an empty buffer which will be properly resized after first call of the convolve function. Constructs an empty buffer which will be properly resized after first call of the convolve function.
\cvdefCpp{ConvolveBuf(Size image\_size, Size templ\_size);} \cvdefCpp{ConvolveBuf::ConvolveBuf(Size image\_size, Size templ\_size);}
Constructs a buffer for the convolve function with the respective arguments. Constructs a buffer for the convolve function with the respective arguments.

@ -82,13 +82,13 @@ Creates HOG descriptor and detector.
\cvCppFunc{gpu::HOGDescriptor::getDescriptorSize} \cvCppFunc{gpu::HOGDescriptor::getDescriptorSize}
Returns number of coefficients required for the classification. Returns number of coefficients required for the classification.
\cvdefCpp{size\_t getDescriptorSize() const;} \cvdefCpp{size\_t HOGDescriptor::getDescriptorSize() const;}
\cvCppFunc{gpu::HOGDescriptor::getBlockHistogramSize} \cvCppFunc{gpu::HOGDescriptor::getBlockHistogramSize}
Returns block histogram size. Returns block histogram size.
\cvdefCpp{size\_t getBlockHistogramSize() const;} \cvdefCpp{size\_t HOGDescriptor::getBlockHistogramSize() const;}
\cvCppFunc{gpu::HOGDescriptor::setSVMDetector} \cvCppFunc{gpu::HOGDescriptor::setSVMDetector}
@ -100,25 +100,25 @@ Sets coefficients for the linear SVM classifier.
\cvCppFunc{gpu::HOGDescriptor::getDefaultPeopleDetector} \cvCppFunc{gpu::HOGDescriptor::getDefaultPeopleDetector}
Returns coefficients of the classifier trained for people detection (for default window size). Returns coefficients of the classifier trained for people detection (for default window size).
\cvdefCpp{static vector<float> getDefaultPeopleDetector();} \cvdefCpp{static vector<float> HOGDescriptor::getDefaultPeopleDetector();}
\cvCppFunc{gpu::HOGDescriptor::getPeopleDetector48x96} \cvCppFunc{gpu::HOGDescriptor::getPeopleDetector48x96}
Returns coefficients of the classifier trained for people detection (for 48x96 windows). Returns coefficients of the classifier trained for people detection (for 48x96 windows).
\cvdefCpp{static vector<float> getPeopleDetector48x96();} \cvdefCpp{static vector<float> HOGDescriptor::getPeopleDetector48x96();}
\cvCppFunc{gpu::HOGDescriptor::getPeopleDetector64x128} \cvCppFunc{gpu::HOGDescriptor::getPeopleDetector64x128}
Returns coefficients of the classifier trained for people detection (for 64x128 windows). Returns coefficients of the classifier trained for people detection (for 64x128 windows).
\cvdefCpp{static vector<float> getPeopleDetector64x128();} \cvdefCpp{static vector<float> HOGDescriptor::getPeopleDetector64x128();}
\cvCppFunc{gpu::HOGDescriptor::detect} \cvCppFunc{gpu::HOGDescriptor::detect}
Performs object detection without multiscale window. Performs object detection without multiscale window.
\cvdefCpp{void detect(const GpuMat\& img, vector<Point>\& found\_locations,\par \cvdefCpp{void HOGDescriptor::detect(const GpuMat\& img, vector<Point>\& found\_locations,\par
double hit\_threshold=0, Size win\_stride=Size(),\par double hit\_threshold=0, Size win\_stride=Size(),\par
Size padding=Size());} Size padding=Size());}
@ -134,10 +134,10 @@ Perfroms object detection without multiscale window.
\cvCppFunc{gpu::HOGDescriptor::detectMultiScale} \cvCppFunc{gpu::HOGDescriptor::detectMultiScale}
Performs object detection with multiscale window. Performs object detection with multiscale window.
\cvdefCpp{void detectMultiScale(const GpuMat\& img, vector<Rect>\& found\_locations,\par \cvdefCpp{void HOGDescriptor::detectMultiScale(const GpuMat\& img,\par
double hit\_threshold=0, Size win\_stride=Size(),\par vector<Rect>\& found\_locations, double hit\_threshold=0,\par
Size padding=Size(), double scale0=1.05,\par Size win\_stride=Size(), Size padding=Size(),\par
int group\_threshold=2);} double scale0=1.05, int group\_threshold=2);}
\begin{description} \begin{description}
\cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.} \cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.}
@ -154,9 +154,9 @@ See \cvCppCross{groupRectangles}.}
\cvCppFunc{gpu::HOGDescriptor::getDescriptors} \cvCppFunc{gpu::HOGDescriptor::getDescriptors}
Returns block descriptors computed for the whole image. It's mainly used for classifier learning purposes. Returns block descriptors computed for the whole image. It's mainly used for classifier learning purposes.
\cvdefCpp{void getDescriptors(const GpuMat\& img, Size win\_stride,\par \cvdefCpp{void HOGDescriptor::getDescriptors(const GpuMat\& img,\par
GpuMat\& descriptors,\par Size win\_stride, GpuMat\& descriptors,\par
int descr\_format=DESCR\_FORMAT\_COL\_BY\_COL);} int descr\_format=DESCR\_FORMAT\_COL\_BY\_COL);}
\begin{description} \begin{description}
\cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.} \cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.}

@ -41,6 +41,7 @@
//M*/ //M*/
#include "precomp.hpp" #include "precomp.hpp"
#include <functional>
using namespace cv; using namespace cv;
using namespace cv::gpu; using namespace cv::gpu;
@ -58,12 +59,12 @@ CV_EXPORTS void cv::gpu::getGpuMemInfo(size_t& /*free*/, size_t& /*total*/) { t
CV_EXPORTS bool cv::gpu::hasNativeDoubleSupport(int /*device*/) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasNativeDoubleSupport(int /*device*/) { throw_nogpu(); return false; }
CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int /*device*/) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int /*device*/) { throw_nogpu(); return false; }
CV_EXPORTS bool cv::gpu::hasPtxVersion(int major, int minor) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasPtxVersion(int major, int minor) { throw_nogpu(); return false; }
CV_EXPORTS bool cv::gpu::hasLessOrEqualPtxVersion(int major, int minor) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasLessOrEqualPtxVersion(int major, int minor) { return false; }
CV_EXPORTS bool cv::gpu::hasGreaterOrEqualPtxVersion(int major, int minor) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasGreaterOrEqualPtxVersion(int major, int minor) { return false; }
CV_EXPORTS bool cv::gpu::hasCubinVersion(int major, int minor) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasCubinVersion(int major, int minor) { return false; }
CV_EXPORTS bool cv::gpu::hasGreaterOrEqualCubinVersion(int major, int minor) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasGreaterOrEqualCubinVersion(int major, int minor) { return false; }
CV_EXPORTS bool cv::gpu::hasVersion(int major, int minor) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasVersion(int major, int minor) { return false; }
CV_EXPORTS bool cv::gpu::hasGreaterOrEqualVersion(int major, int minor) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::hasGreaterOrEqualVersion(int major, int minor) { return false; }
CV_EXPORTS bool cv::gpu::isCompatibleWith(int device) { throw_nogpu(); return false; } CV_EXPORTS bool cv::gpu::isCompatibleWith(int device) { throw_nogpu(); return false; }
@ -142,118 +143,55 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device)
namespace namespace
{ {
template <typename Comparer> template <typename Comparer>
bool checkPtxVersion(int major, int minor, Comparer cmp) bool compare(const std::string& str, int x, Comparer cmp)
{ {
#ifdef OPENCV_ARCH_PTX_10 std::stringstream stream(str);
if (cmp(1, 0, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_PTX_11
if (cmp(1, 1, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_PTX_12
if (cmp(1, 2, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_PTX_13
if (cmp(1, 3, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_PTX_20
if (cmp(2, 0, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_PTX_21
if (cmp(2, 1, major, minor)) return true;
#endif
return false;
}
template <typename Comparer>
bool checkCubinVersion(int major, int minor, Comparer cmp)
{
#ifdef OPENCV_ARCH_GPU_10
if (cmp(1, 0, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_GPU_11
if (cmp(1, 1, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_GPU_12 int val;
if (cmp(1, 2, major, minor)) return true; stream >> val;
#endif
#ifdef OPENCV_ARCH_GPU_13
if (cmp(1, 3, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_GPU_20
if (cmp(2, 0, major, minor)) return true;
#endif
#ifdef OPENCV_ARCH_GPU_21
if (cmp(2, 1, major, minor)) return true;
#endif
return false;
}
struct ComparerEqual while (!stream.eof() && !stream.fail())
{
bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
{
return lhs1 == rhs1 && lhs2 == rhs2;
}
};
struct ComparerLessOrEqual
{
bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
{ {
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2); if (cmp(val, x))
return true;
stream >> val;
} }
};
struct ComparerGreaterOrEqual return false;
{ }
bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
{
return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
}
};
} }
CV_EXPORTS bool cv::gpu::hasPtxVersion(int major, int minor) CV_EXPORTS bool cv::gpu::hasPtxVersion(int major, int minor)
{ {
return checkPtxVersion(major, minor, ComparerEqual()); return ::compare(OPENCV_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
} }
CV_EXPORTS bool cv::gpu::hasLessOrEqualPtxVersion(int major, int minor) CV_EXPORTS bool cv::gpu::hasLessOrEqualPtxVersion(int major, int minor)
{ {
return checkPtxVersion(major, minor, ComparerLessOrEqual()); return ::compare(OPENCV_ARCH_PTX, major * 10 + minor,
std::less_equal<int>());
} }
CV_EXPORTS bool cv::gpu::hasGreaterOrEqualPtxVersion(int major, int minor) CV_EXPORTS bool cv::gpu::hasGreaterOrEqualPtxVersion(int major, int minor)
{ {
return checkPtxVersion(major, minor, ComparerGreaterOrEqual()); return ::compare(OPENCV_ARCH_PTX, major * 10 + minor,
std::greater_equal<int>());
} }
CV_EXPORTS bool cv::gpu::hasCubinVersion(int major, int minor) CV_EXPORTS bool cv::gpu::hasCubinVersion(int major, int minor)
{ {
return checkCubinVersion(major, minor, ComparerEqual()); return ::compare(OPENCV_ARCH_GPU, major * 10 + minor, std::equal_to<int>());
} }
CV_EXPORTS bool cv::gpu::hasGreaterOrEqualCubinVersion(int major, int minor) CV_EXPORTS bool cv::gpu::hasGreaterOrEqualCubinVersion(int major, int minor)
{ {
return checkCubinVersion(major, minor, ComparerGreaterOrEqual()); return ::compare(OPENCV_ARCH_GPU, major * 10 + minor,
std::greater_equal<int>());
} }
@ -284,7 +222,7 @@ CV_EXPORTS bool cv::gpu::isCompatibleWith(int device)
return true; return true;
// Check CUBIN compatibility // Check CUBIN compatibility
for (int i = 0; i <= minor; ++i) for (int i = minor; i >= 0; --i)
if (hasCubinVersion(major, i)) if (hasCubinVersion(major, i))
return true; return true;

@ -85,6 +85,10 @@
#error "Insufficient NPP version, please update it." #error "Insufficient NPP version, please update it."
#endif #endif
#if defined(OPENCV_ARCH_GPU_OR_PTX_10)
#error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
#endif
static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); } static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); }
#else /* defined(HAVE_CUDA) */ #else /* defined(HAVE_CUDA) */

Loading…
Cancel
Save