diff --git a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib b/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib index 39d29b8b1a..98d5c227a1 100644 Binary files a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib and b/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib differ diff --git a/3rdparty/NPP_staging/npp_staging.h b/3rdparty/NPP_staging/npp_staging.h index c54af5c032..a9c6607dff 100644 --- a/3rdparty/NPP_staging/npp_staging.h +++ b/3rdparty/NPP_staging/npp_staging.h @@ -188,14 +188,14 @@ struct NppStSize32u enum NppStStatus { //already present in NPP - /* NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR) - NPP_ERROR = -1, ///< Unknown error - NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error - NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error - NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned - NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error - NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error - NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error*/ + //NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR) + //NPP_ERROR = -1, ///< Unknown error + //NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error + //NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error + //NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned + //NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error + //NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error + //NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error //to be added NPP_INVALID_ROI, ///< Invalid region of interest argument @@ -244,7 +244,7 @@ extern "C" { /** \defgroup core_npp NPP Core * Basic functions for CUDA streams management. - * WARNING: These functions couldn't be exported from NPP_staging library, so they can't be used + * WARNING: These functions couldn't be exported into DLL, so they can be used only with static version of NPP_staging * @{ */ @@ -569,6 +569,13 @@ NppStStatus nppiStTranspose_64f_C1R_host(NppSt64f *h_src, NppSt32u srcStride, NppStStatus nppiStIntegralGetSize_8u32u(NppStSize32u roiSize, NppSt32u *pBufsize); +/** + * Calculates the size of the temporary buffer for integral image creation + * \see nppiStIntegralGetSize_8u32u + */ +NppStStatus nppiStIntegralGetSize_32f32f(NppStSize32u roiSize, NppSt32u *pBufsize); + + /** * Creates an integral image representation for the input image * @@ -587,6 +594,15 @@ NppStStatus nppiStIntegral_8u32u_C1R(NppSt8u *d_src, NppSt32u srcStep, NppSt8u *pBuffer, NppSt32u bufSize); +/** + * Creates an integral image representation for the input image + * \see nppiStIntegral_8u32u_C1R + */ +NppStStatus nppiStIntegral_32f32f_C1R(NppSt32f *d_src, NppSt32u srcStep, + NppSt32f *d_dst, NppSt32u dstStep, NppStSize32u roiSize, + NppSt8u *pBuffer, NppSt32u bufSize); + + /** * Creates an integral image representation for the input image. Host implementation * @@ -602,6 +618,14 @@ NppStStatus nppiStIntegral_8u32u_C1R_host(NppSt8u *h_src, NppSt32u srcStep, NppSt32u *h_dst, NppSt32u dstStep, NppStSize32u roiSize); +/** + * Creates an integral image representation for the input image. 
Host implementation + * \see nppiStIntegral_8u32u_C1R_host + */ +NppStStatus nppiStIntegral_32f32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep, + NppSt32f *h_dst, NppSt32u dstStep, NppStSize32u roiSize); + + /** * Calculates the size of the temporary buffer for squared integral image creation * diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 3c10b0ee2b..f25036ac24 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -35,6 +35,13 @@ source_group("Include" FILES ${lib_hdrs}) file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*") source_group("Device" FILES ${lib_device_hdrs}) +if (HAVE_CUDA AND MSVC) + file(GLOB ncv_srcs "src/nvidia/*.cpp") + file(GLOB ncv_hdrs "src/nvidia/*.h*") + file(GLOB ncv_cuda "src/nvidia/*.cu") + source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda}) +endif() + if (HAVE_CUDA) get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp}) @@ -68,19 +75,16 @@ if (HAVE_CUDA) string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") endif() + - CUDA_COMPILE(cuda_objs ${lib_cuda}) + include(FindNPP_staging.cmake) + include_directories(${NPPST_INC}) + + CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda}) #CUDA_BUILD_CLEAN_TARGET() endif() - -add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${cuda_objs}) - -IF (HAVE_CUDA) - include(FindNPP_staging.cmake) - include_directories(${NPPST_INC}) - target_link_libraries(${the_target} ${NPPST_LIB}) -endif() +add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs}) if(PCHSupport_FOUND) set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp) @@ -114,6 +118,7 @@ target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} ) if (HAVE_CUDA) target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES}) + target_link_libraries(${the_target} ${NPPST_LIB}) CUDA_ADD_CUFFT_TO_TARGET(${the_target}) endif() diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 70a34eee6e..3979d62dd7 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1380,87 +1380,39 @@ namespace cv explicit BruteForceMatcher_GPU(L2 /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {} }; - ////////////////////////////////// CascadeClassifier ////////////////////////////////////////// + ////////////////////////////////// CascadeClassifier_GPU ////////////////////////////////////////// // The cascade classifier class for object detection. 
- class CV_EXPORTS CascadeClassifier + class CV_EXPORTS CascadeClassifier_GPU { - public: - struct CV_EXPORTS DTreeNode - { - int featureIdx; - float threshold; // for ordered features only - int left; - int right; - }; - - struct CV_EXPORTS DTree - { - int nodeCount; - }; - - struct CV_EXPORTS Stage - { - int first; - int ntrees; - float threshold; - }; - - enum { BOOST = 0 }; - enum { DO_CANNY_PRUNING = 1, SCALE_IMAGE = 2,FIND_BIGGEST_OBJECT = 4, DO_ROUGH_SEARCH = 8 }; - - CascadeClassifier(); - CascadeClassifier(const string& filename); - ~CascadeClassifier(); + public: + CascadeClassifier_GPU(); + CascadeClassifier_GPU(const string& filename); + ~CascadeClassifier_GPU(); bool empty() const; bool load(const string& filename); - bool read(const FileNode& node); - - void detectMultiScale( const Mat& image, vector& objects, double scaleFactor=1.1, - int minNeighbors=3, int flags=0, Size minSize=Size(), Size maxSize=Size()); - - bool setImage( Ptr&, const Mat& ); - int runAt( Ptr&, Point ); - - bool isStumpBased; - - int stageType; - int featureType; - int ncategories; - Size origWinSize; - - vector stages; - vector classifiers; - vector nodes; - vector leaves; - vector subsets; + void release(); + + /* returns number of detected objects */ + int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size()); + + bool findLargestObject; + bool visualizeInPlace; - Ptr feval; - Ptr oldCascade; + Size getClassifierSize() const; + private: + + struct CascadeClassifierImpl; + CascadeClassifierImpl* impl; }; - + ////////////////////////////////// SURF ////////////////////////////////////////// struct CV_EXPORTS SURFParams_GPU { - SURFParams_GPU() : - threshold(0.1f), - nOctaves(4), - nIntervals(4), - initialScale(2.f), - - l1(3.f/1.5f), - l2(5.f/1.5f), - l3(3.f/1.5f), - l4(1.f/1.5f), - edgeScale(0.81f), - initialStep(1), - - extended(true), - - featuresRatio(0.01f) - { - } + SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f), + l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f), + edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {} //! 
The interest operator threshold float threshold; diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 28acc6a713..968e192d55 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -170,8 +170,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst) if (src.type() == CV_8UC1) { - nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, - nppLut.ptr(), lvls.pLevels, 256) ); + nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, nppLut.ptr(), lvls.pLevels, 256) ); } else { @@ -186,8 +185,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst) pValues3[1] = nppLut3[1].ptr(); pValues3[2] = nppLut3[2].ptr(); } - nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr(), src.step, dst.ptr(), dst.step, sz, - pValues3, lvls.pLevels3, lvls.nValues3) ); + nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr(), src.step, dst.ptr(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) ); } } diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index e6a4e7295f..3a7d2ca47d 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -42,69 +42,751 @@ #include "precomp.hpp" - - - using namespace cv; using namespace cv::gpu; using namespace std; -#if !defined (HAVE_CUDA) -cv::gpu::CascadeClassifier::CascadeClassifier() { throw_nogpu(); } -cv::gpu::CascadeClassifier::CascadeClassifier(const string&) { throw_nogpu(); } -cv::gpu::CascadeClassifier::~CascadeClassifier() { throw_nogpu(); } +#if !defined (HAVE_CUDA) || (defined(_MSC_VER) && _MSC_VER != 1500) || !defined(_MSC_VER) -bool cv::gpu::CascadeClassifier::empty() const { throw_nogpu(); return true; } -bool cv::gpu::CascadeClassifier::load(const string& filename) { throw_nogpu(); return true; } -bool cv::gpu::CascadeClassifier::read(const FileNode& node) { throw_nogpu(); return true; } +cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); } +cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string&) { throw_nogpu(); } +cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { throw_nogpu(); } -void cv::gpu::CascadeClassifier::detectMultiScale( const Mat&, vector&, double, int, int, Size, Size) { throw_nogpu(); } +bool cv::gpu::CascadeClassifier_GPU::empty() const { throw_nogpu(); return true; } +bool cv::gpu::CascadeClassifier_GPU::load(const string&) { throw_nogpu(); return true; } +Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_nogpu(); return Size(); } - +int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& , GpuMat& , double , int , Size) { throw_nogpu(); return 0; } +#if defined (HAVE_CUDA) + NCVStatus loadFromXML(const string&, HaarClassifierCascadeDescriptor&, vector&, + vector&, vector&) { throw_nogpu(); return NCVStatus(); } + void groupRectangles(vector&, int, double, vector*) { throw_nogpu(); } +#endif #else +struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl +{ + CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1) + { + ncvSetDebugOutputHandler(NCVDebugOutputHandler); + if (ncvStat != load(filename)) + CV_Error(CV_GpuApiCallError, "Error in GPU cacade load"); + } + NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors, bool findLargestObject, bool visualizeInPlace, NcvSize32u ncvMinSize, /*out*/unsigned int& numDetections) + { + calculateMemReqsAndAllocate(src.size()); -cv::gpu::CascadeClassifier::CascadeClassifier() -{ + 
NCVMemPtr src_beg; + src_beg.ptr = (void*)src.ptr(); + src_beg.memtype = NCVMemoryTypeDevice; -} + NCVMemSegment src_seg; + src_seg.begin = src_beg; + src_seg.size = src.step * src.rows; -cv::gpu::CascadeClassifier::CascadeClassifier(const string& filename) -{ + NCVMatrixReuse d_src(src_seg, devProp.textureAlignment, src.cols, src.rows, src.step, true); + + //NCVMatrixAlloc d_src(*gpuAllocator, src.cols, src.rows); + //ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); -} + //NCVMatrixAlloc h_src(*cpuAllocator, src.cols, src.rows); + //ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); -cv::gpu::CascadeClassifier::~CascadeClassifier() -{ + CV_Assert(objects.rows == 1); + + NCVMemPtr objects_beg; + objects_beg.ptr = (void*)objects.ptr(); + objects_beg.memtype = NCVMemoryTypeDevice; + + NCVMemSegment objects_seg; + objects_seg.begin = objects_beg; + objects_seg.size = objects.step * objects.rows; + NCVVectorReuse d_rects(objects_seg, objects.cols); + //NCVVectorAlloc d_rects(*gpuAllocator, 100); + //ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NcvSize32u roi; + roi.width = d_src.width(); + roi.height = d_src.height(); + + Ncv32u flags = 0; + flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0; + flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0; + + ncvStat = ncvDetectObjectsMultiScale_device( + d_src, roi, d_rects, numDetections, haar, *h_haarStages, + *d_haarStages, *d_haarNodes, *d_haarFeatures, + ncvMinSize, + minNeighbors, + scaleStep, 1, + flags, + *gpuAllocator, *cpuAllocator, devProp.major, devProp.minor, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + return NCV_SUCCESS; + } + //// + NcvSize32u getClassifierSize() const { return haar.ClassifierSize; } + cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); } +private: + + static void NCVDebugOutputHandler(const char* msg) { CV_Error(CV_GpuApiCallError, msg); } + + NCVStatus load(const string& classifierFile) + { + int devId = cv::gpu::getDevice(); + ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR); + + // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator + gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice); + cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned); + + ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR); + + Ncv32u haarNumStages, haarNumNodes, haarNumFeatures; + ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR); + + h_haarStages = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumStages); + h_haarNodes = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumNodes); + h_haarFeatures = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumFeatures); + + ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); + + ncvStat = 
ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR); + + d_haarStages = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumStages); + d_haarNodes = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumNodes); + d_haarFeatures = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumFeatures); + + ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); + + ncvStat = h_haarStages->copySolid(*d_haarStages, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); + ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); + ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); + + return NCV_SUCCESS; + } + //// + + NCVStatus calculateMemReqsAndAllocate(const Size& frameSize) + { + if (lastAllocatedFrameSize == frameSize) + return NCV_SUCCESS; + + // Calculate memory requirements and create real allocators + NCVMemStackAllocator gpuCounter(devProp.textureAlignment); + NCVMemStackAllocator cpuCounter(devProp.textureAlignment); + + ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR); + ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR); + + NCVMatrixAlloc d_src(gpuCounter, frameSize.width, frameSize.height); + NCVMatrixAlloc h_src(cpuCounter, frameSize.width, frameSize.height); + + ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVVectorAlloc d_rects(gpuCounter, 100); + ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NcvSize32u roi; + roi.width = d_src.width(); + roi.height = d_src.height(); + Ncv32u numDetections; + ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages, + *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0); + + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment); + cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment); + + ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR); + return NCV_SUCCESS; + } + //// + + cudaDeviceProp devProp; + NCVStatus ncvStat; + + Ptr gpuCascadeAllocator; + Ptr cpuCascadeAllocator; + + Ptr > h_haarStages; + Ptr > h_haarNodes; + Ptr > h_haarFeatures; + + HaarClassifierCascadeDescriptor haar; + + Ptr > d_haarStages; + Ptr > d_haarNodes; + Ptr > d_haarFeatures; + + Size lastAllocatedFrameSize; + + Ptr gpuAllocator; + Ptr cpuAllocator; +}; + + + +cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() : findLargestObject(false), 
visualizeInPlace(false), impl(0) {} +cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string& filename) : findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); } +cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); } +bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; } + +void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } } + +bool cv::gpu::CascadeClassifier_GPU::load(const string& filename) +{ + release(); + impl = new CascadeClassifierImpl(filename); + return !this->empty(); } -bool cv::gpu::CascadeClassifier::empty() const +Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { - int *a = (int*)&nppiStTranspose_32u_C1R; - return *a == 0xFFFFF; - return true; + return this->empty() ? Size() : impl->getClassifierCvSize(); } + +int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize) +{ + CV_Assert( scaleFactor > 1 && image.depth() == CV_8U); + CV_Assert( !this->empty()); + + const int defaultObjSearchNum = 100; + if (objectsBuf.empty()) + objectsBuf.create(1, defaultObjSearchNum, DataType::type); + + NcvSize32u ncvMinSize = impl->getClassifierSize(); -bool cv::gpu::CascadeClassifier::load(const string& filename) -{ - return true; + if (ncvMinSize.width < (unsigned)minSize.width && ncvMinSize.height < (unsigned)minSize.height) + { + ncvMinSize.width = minSize.width; + ncvMinSize.height = minSize.height; + } + + unsigned int numDetections; + NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections); + if (ncvStat != NCV_SUCCESS) + CV_Error(CV_GpuApiCallError, "Error in face detectioln"); + + return numDetections; } -bool cv::gpu::CascadeClassifier::read(const FileNode& node) + struct RectConvert + { + Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); } + NcvRect32u operator()(const Rect& nr) const + { + NcvRect32u rect; + rect.x = nr.x; + rect.y = nr.y; + rect.width = nr.width; + rect.height = nr.height; + return rect; + } + }; + + void groupRectangles(std::vector &hypotheses, int groupThreshold, double eps, std::vector *weights) + { + vector rects(hypotheses.size()); + std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert()); + + if (weights) + { + vector weights_int; + weights_int.assign(weights->begin(), weights->end()); + cv::groupRectangles(rects, weights_int, groupThreshold, eps); + } + else + { + cv::groupRectangles(rects, groupThreshold, eps); + } + std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert()); + hypotheses.resize(rects.size()); + } + + +#if 1 /* loadFromXML implementation switch */ + +NCVStatus loadFromXML(const std::string &filename, + HaarClassifierCascadeDescriptor &haar, + std::vector &haarStages, + std::vector &haarClassifierNodes, + std::vector &haarFeatures) { - return true; + NCVStatus ncvStat; + + haar.NumStages = 0; + haar.NumClassifierRootNodes = 0; + haar.NumClassifierTotalNodes = 0; + haar.NumFeatures = 0; + haar.ClassifierSize.width = 0; + haar.ClassifierSize.height = 0; + haar.bHasStumpsOnly = true; + haar.bNeedsTiltedII = false; + Ncv32u curMaxTreeDepth; + + std::vector xmlFileCont; + + std::vector h_TmpClassifierNotRootNodes; + haarStages.resize(0); + haarClassifierNodes.resize(0); + haarFeatures.resize(0); + + Ptr oldCascade = 
(CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0); + if (oldCascade.empty()) + return NCV_HAAR_XML_LOADING_EXCEPTION; + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + haar.ClassifierSize.width = oldCascade->orig_window_size.width; + haar.ClassifierSize.height = oldCascade->orig_window_size.height; + + int stagesCound = oldCascade->count; + for(int s = 0; s < stagesCound; ++s) // by stages + { + HaarStage64 curStage; + curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size()); + + curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold); + + int treesCount = oldCascade->stage_classifier[s].count; + for(int t = 0; t < treesCount; ++t) // bytrees + { + Ncv32u nodeId = 0; + CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t]; + + int nodesCount = tree->count; + for(int n = 0; n < nodesCount; ++n) //by features + { + CvHaarFeature* feature = &tree->haar_feature[n]; + + HaarClassifierNode128 curNode; + curNode.setThreshold(tree->threshold[n]); + + HaarClassifierNodeDescriptor32 nodeLeft; + if ( tree->left[n] <= 0 ) + { + Ncv32f leftVal = tree->alpha[-tree->left[n]]; + ncvStat = nodeLeft.create(leftVal); + ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + } + else + { + Ncv32u leftNodeOffset = tree->left[n]; + nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1)); + haar.bHasStumpsOnly = false; + } + curNode.setLeftNodeDesc(nodeLeft); + + HaarClassifierNodeDescriptor32 nodeRight; + if ( tree->right[n] <= 0 ) + { + Ncv32f rightVal = tree->alpha[-tree->right[n]]; + ncvStat = nodeRight.create(rightVal); + ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + } + else + { + Ncv32u rightNodeOffset = tree->right[n]; + nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1)); + haar.bHasStumpsOnly = false; + } + curNode.setRightNodeDesc(nodeRight); + + Ncv32u tiltedVal = feature->tilted; + haar.bNeedsTiltedII = (tiltedVal != 0); + + Ncv32u featureId = 0; + for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects + { + Ncv32u rectX = feature->rect[l].r.x; + Ncv32u rectY = feature->rect[l].r.y; + Ncv32u rectWidth = feature->rect[l].r.width; + Ncv32u rectHeight = feature->rect[l].r.height; + + Ncv32f rectWeight = feature->rect[l].weight; + + if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/) + break; + + HaarFeature64 curFeature; + ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height); + curFeature.setWeight(rectWeight); + ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); + haarFeatures.push_back(curFeature); + + featureId++; + } + + HaarFeatureDescriptor32 tmpFeatureDesc; + ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId); + ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); + curNode.setFeatureDesc(tmpFeatureDesc); + + if (!nodeId) + { + //root node + haarClassifierNodes.push_back(curNode); + curMaxTreeDepth = 1; + } + else + { + //other node + h_TmpClassifierNotRootNodes.push_back(curNode); + curMaxTreeDepth++; + } + + nodeId++; + } + } + + curStage.setNumClassifierRootNodes(treesCount); + haarStages.push_back(curStage); + } +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + //fill in cascade stats + haar.NumStages = haarStages.size(); + 
haar.NumClassifierRootNodes = haarClassifierNodes.size(); + haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size(); + haar.NumFeatures = haarFeatures.size(); + + //merge root and leaf nodes in one classifiers array + Ncv32u offsetRoot = haarClassifierNodes.size(); + for (Ncv32u i=0; i& objects, double scaleFactor, - int minNeighbors, int flags, Size minSize, Size maxSize) +//// + +#else /* loadFromXML implementation switch */ +#include "e:/devNPP-OpenCV/src/external/_rapidxml-1.13/rapidxml.hpp" + +NCVStatus loadFromXML(const std::string &filename, + HaarClassifierCascadeDescriptor &haar, + std::vector &haarStages, + std::vector &haarClassifierNodes, + std::vector &haarFeatures) { + NCVStatus ncvStat; + + haar.NumStages = 0; + haar.NumClassifierRootNodes = 0; + haar.NumClassifierTotalNodes = 0; + haar.NumFeatures = 0; + haar.ClassifierSize.width = 0; + haar.ClassifierSize.height = 0; + haar.bNeedsTiltedII = false; + haar.bHasStumpsOnly = false; + + FILE *fp; + fopen_s(&fp, filename.c_str(), "r"); + ncvAssertReturn(fp != NULL, NCV_FILE_ERROR); + + //get file size + fseek(fp, 0, SEEK_END); + Ncv32u xmlSize = ftell(fp); + fseek(fp, 0, SEEK_SET); + + //load file to vector + std::vector xmlFileCont; + xmlFileCont.resize(xmlSize+1); + memset(&xmlFileCont[0], 0, xmlSize+1); + fread_s(&xmlFileCont[0], xmlSize, 1, xmlSize, fp); + fclose(fp); + + haar.bHasStumpsOnly = true; + haar.bNeedsTiltedII = false; + Ncv32u curMaxTreeDepth; + + std::vector h_TmpClassifierNotRootNodes; + haarStages.resize(0); + haarClassifierNodes.resize(0); + haarFeatures.resize(0); + + //XML loading and OpenCV XML classifier syntax verification + try + { + rapidxml::xml_document<> doc; + doc.parse<0>(&xmlFileCont[0]); + + //opencv_storage + rapidxml::xml_node<> *parserGlobal = doc.first_node(); + ncvAssertReturn(!strcmp(parserGlobal->name(), "opencv_storage"), NCV_HAAR_XML_LOADING_EXCEPTION); + + //classifier type + parserGlobal = parserGlobal->first_node(); + ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION); + rapidxml::xml_attribute<> *attr = parserGlobal->first_attribute("type_id"); + ncvAssertReturn(!strcmp(attr->value(), "opencv-haar-classifier"), NCV_HAAR_XML_LOADING_EXCEPTION); + //classifier size + parserGlobal = parserGlobal->first_node("size"); + ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION); + sscanf_s(parserGlobal->value(), "%d %d", &(haar.ClassifierSize.width), &(haar.ClassifierSize.height)); + + //parse stages + parserGlobal = parserGlobal->next_sibling("stages"); + ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION); + parserGlobal = parserGlobal->first_node("_"); + ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION); + + while (parserGlobal) + { + HaarStage64 curStage; + curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size()); + Ncv32u tmpNumClassifierRootNodes = 0; + + rapidxml::xml_node<> *parserStageThreshold = parserGlobal->first_node("stage_threshold"); + ncvAssertReturn(parserStageThreshold, NCV_HAAR_XML_LOADING_EXCEPTION); + Ncv32f tmpStageThreshold; + sscanf_s(parserStageThreshold->value(), "%f", &tmpStageThreshold); + curStage.setStageThreshold(tmpStageThreshold); + + //parse trees + rapidxml::xml_node<> *parserTree; + parserTree = parserGlobal->first_node("trees"); + ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION); + parserTree = parserTree->first_node("_"); + ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION); + + while (parserTree) + { + rapidxml::xml_node<> *parserNode; + 
parserNode = parserTree->first_node("_"); + ncvAssertReturn(parserNode, NCV_HAAR_XML_LOADING_EXCEPTION); + Ncv32u nodeId = 0; + + while (parserNode) + { + HaarClassifierNode128 curNode; + + rapidxml::xml_node<> *parserNodeThreshold = parserNode->first_node("threshold"); + ncvAssertReturn(parserNodeThreshold, NCV_HAAR_XML_LOADING_EXCEPTION); + Ncv32f tmpThreshold; + sscanf_s(parserNodeThreshold->value(), "%f", &tmpThreshold); + curNode.setThreshold(tmpThreshold); + + rapidxml::xml_node<> *parserNodeLeft = parserNode->first_node("left_val"); + HaarClassifierNodeDescriptor32 nodeLeft; + if (parserNodeLeft) + { + Ncv32f leftVal; + sscanf_s(parserNodeLeft->value(), "%f", &leftVal); + ncvStat = nodeLeft.create(leftVal); + ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + } + else + { + parserNodeLeft = parserNode->first_node("left_node"); + ncvAssertReturn(parserNodeLeft, NCV_HAAR_XML_LOADING_EXCEPTION); + Ncv32u leftNodeOffset; + sscanf_s(parserNodeLeft->value(), "%d", &leftNodeOffset); + nodeLeft.create(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1); + haar.bHasStumpsOnly = false; + } + curNode.setLeftNodeDesc(nodeLeft); + + rapidxml::xml_node<> *parserNodeRight = parserNode->first_node("right_val"); + HaarClassifierNodeDescriptor32 nodeRight; + if (parserNodeRight) + { + Ncv32f rightVal; + sscanf_s(parserNodeRight->value(), "%f", &rightVal); + ncvStat = nodeRight.create(rightVal); + ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + } + else + { + parserNodeRight = parserNode->first_node("right_node"); + ncvAssertReturn(parserNodeRight, NCV_HAAR_XML_LOADING_EXCEPTION); + Ncv32u rightNodeOffset; + sscanf_s(parserNodeRight->value(), "%d", &rightNodeOffset); + nodeRight.create(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1); + haar.bHasStumpsOnly = false; + } + curNode.setRightNodeDesc(nodeRight); + + rapidxml::xml_node<> *parserNodeFeatures = parserNode->first_node("feature"); + ncvAssertReturn(parserNodeFeatures, NCV_HAAR_XML_LOADING_EXCEPTION); + + rapidxml::xml_node<> *parserNodeFeaturesTilted = parserNodeFeatures->first_node("tilted"); + ncvAssertReturn(parserNodeFeaturesTilted, NCV_HAAR_XML_LOADING_EXCEPTION); + Ncv32u tiltedVal; + sscanf_s(parserNodeFeaturesTilted->value(), "%d", &tiltedVal); + haar.bNeedsTiltedII = (tiltedVal != 0); + + rapidxml::xml_node<> *parserNodeFeaturesRects = parserNodeFeatures->first_node("rects"); + ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION); + parserNodeFeaturesRects = parserNodeFeaturesRects->first_node("_"); + ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION); + Ncv32u featureId = 0; + + while (parserNodeFeaturesRects) + { + Ncv32u rectX, rectY, rectWidth, rectHeight; + Ncv32f rectWeight; + sscanf_s(parserNodeFeaturesRects->value(), "%d %d %d %d %f", &rectX, &rectY, &rectWidth, &rectHeight, &rectWeight); + HaarFeature64 curFeature; + ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height); + curFeature.setWeight(rectWeight); + ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); + haarFeatures.push_back(curFeature); + + parserNodeFeaturesRects = parserNodeFeaturesRects->next_sibling("_"); + featureId++; + } + + HaarFeatureDescriptor32 tmpFeatureDesc; + ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId); + ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); + curNode.setFeatureDesc(tmpFeatureDesc); + + if (!nodeId) + { + //root node + haarClassifierNodes.push_back(curNode); + 
curMaxTreeDepth = 1; + } + else + { + //other node + h_TmpClassifierNotRootNodes.push_back(curNode); + curMaxTreeDepth++; + } + + parserNode = parserNode->next_sibling("_"); + nodeId++; + } + + parserTree = parserTree->next_sibling("_"); + tmpNumClassifierRootNodes++; + } + + curStage.setNumClassifierRootNodes(tmpNumClassifierRootNodes); + haarStages.push_back(curStage); + + parserGlobal = parserGlobal->next_sibling("_"); + } + } + catch (...) + { + return NCV_HAAR_XML_LOADING_EXCEPTION; + } + + //fill in cascade stats + haar.NumStages = haarStages.size(); + haar.NumClassifierRootNodes = haarClassifierNodes.size(); + haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size(); + haar.NumFeatures = haarFeatures.size(); + + //merge root and leaf nodes in one classifiers array + Ncv32u offsetRoot = haarClassifierNodes.size(); + for (Ncv32u i=0; i(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz, 0) ); + nppSafeCall( npp_func_8uc1(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0) ); break; case CV_8UC4: - nppSafeCall( npp_func_8uc4(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz, 0) ); + nppSafeCall( npp_func_8uc4(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0) ); break; case CV_32SC1: - nppSafeCall( npp_func_32sc1(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( npp_func_32sc1(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz) ); break; case CV_32FC1: - nppSafeCall( npp_func_32fc1(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( npp_func_32fc1(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz) ); break; default: CV_Assert(!"Unsupported source type"); @@ -133,16 +120,15 @@ namespace template struct NppArithmScalarFunc; template<> struct NppArithmScalarFunc<1> { - typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst, - int nDstStep, NppiSize oSizeROI); + typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst, int nDstStep, NppiSize oSizeROI); }; template<> struct NppArithmScalarFunc<2> { - typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst, - int nDstStep, NppiSize oSizeROI); + typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst, int nDstStep, NppiSize oSizeROI); }; template::func_ptr func> struct NppArithmScalar; + template::func_ptr func> struct NppArithmScalar<1, func> { static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst) @@ -254,24 +240,16 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) switch (src1.type()) { case CV_8UC1: - nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz) ); break; case CV_8UC4: - nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz) ); break; case CV_32SC1: - nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz) ); break; case CV_32FC1: 
- nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz) ); break; default: CV_Assert(!"Unsupported source type"); diff --git a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample b/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample new file mode 100644 index 0000000000..c1926a38c2 --- /dev/null +++ b/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample @@ -0,0 +1,362 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include +#include + +#define CV_NO_BACKWARD_COMPATIBILITY + +#include "opencv2/opencv.hpp" + +#include "NCVHaarObjectDetection.hpp" + +using namespace cv; +using namespace std; + +const Size preferredVideoFrameSize(640, 480); + +string preferredClassifier = "haarcascade_frontalface_alt.xml"; +string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed"; + + +void printSyntax(void) +{ + printf("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n"); +} + + +void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...) 
+{ + int fontFace = CV_FONT_HERSHEY_PLAIN; + double fontScale = 1; + + int baseline; + Size textSize = cv::getTextSize("T", fontFace, fontScale, 1, &baseline); + + va_list arg_ptr; + va_start(arg_ptr, format); + int len = _vscprintf(format, arg_ptr) + 1; + + vector strBuf(len); + vsprintf_s(&strBuf[0], len, format, arg_ptr); + + Point org(1, 3 * textSize.height * (lineOffsY + 1) / 2); + putText(img, &strBuf[0], org, fontFace, fontScale, color); + va_end(arg_ptr); +} + + +NCVStatus process(Mat *srcdst, + Ncv32u width, Ncv32u height, + NcvBool bShowAllHypotheses, NcvBool bLargestFace, + HaarClassifierCascadeDescriptor &haar, + NCVVector &d_haarStages, NCVVector &d_haarNodes, + NCVVector &d_haarFeatures, NCVVector &h_haarStages, + INCVMemAllocator &gpuAllocator, + INCVMemAllocator &cpuAllocator, + cudaDeviceProp &devProp) +{ + ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR); + + NCVStatus ncvStat; + + NCV_SET_SKIP_COND(gpuAllocator.isCounting()); + + NCVMatrixAlloc d_src(gpuAllocator, width, height); + ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVMatrixAlloc h_src(cpuAllocator, width, height); + ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVVectorAlloc d_rects(gpuAllocator, 100); + ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride()); + + NCV_SKIP_COND_BEGIN + + (*srcdst).copyTo(h_src_hdr); + + ncvStat = h_src.copySolid(d_src, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + NCV_SKIP_COND_END + + NcvSize32u roi; + roi.width = d_src.width(); + roi.height = d_src.height(); + + Ncv32u numDetections; + ncvStat = ncvDetectObjectsMultiScale_device( + d_src, roi, d_rects, numDetections, haar, h_haarStages, + d_haarStages, d_haarNodes, d_haarFeatures, + haar.ClassifierSize, + bShowAllHypotheses ? 0 : 4, + 1.2f, 1, + (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace, + gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + NCV_SKIP_COND_BEGIN + + ncvStat = d_src.copySolid(h_src, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + h_src_hdr.copyTo(*srcdst); + + NCV_SKIP_COND_END + + return NCV_SUCCESS; +} + + +int main( int argc, const char** argv ) +{ + NCVStatus ncvStat; + + printf("NVIDIA Computer Vision SDK\n"); + printf("Face Detection in video and live feed\n"); + printf("=========================================\n"); + printf(" Esc - Quit\n"); + printf(" Space - Switch between NCV and OpenCV\n"); + printf(" L - Switch between FullSearch and LargestFace modes\n"); + printf(" U - Toggle unfiltered hypotheses visualization in FullSearch\n"); + + if (argc != 4 && argc != 1) + return printSyntax(), -1; + + VideoCapture capture; + Size frameSize; + + if (argc == 1 || strcmp(argv[1], "-c") == 0) + { + // Camera input is specified + int camIdx = (argc == 3) ? 
atoi(argv[2]) : 0; + if(!capture.open(camIdx)) + return printf("Error opening camera\n"), -1; + + capture.set(CV_CAP_PROP_FRAME_WIDTH, preferredVideoFrameSize.width); + capture.set(CV_CAP_PROP_FRAME_HEIGHT, preferredVideoFrameSize.height); + capture.set(CV_CAP_PROP_FPS, 25); + frameSize = preferredVideoFrameSize; + } + else if (strcmp(argv[1], "-v") == 0) + { + // Video file input (avi) + if(!capture.open(argv[2])) + return printf("Error opening video file\n"), -1; + + frameSize.width = (int)capture.get(CV_CAP_PROP_FRAME_WIDTH); + frameSize.height = (int)capture.get(CV_CAP_PROP_FRAME_HEIGHT); + } + else + return printSyntax(), -1; + + NcvBool bUseOpenCV = true; + NcvBool bLargestFace = true; + NcvBool bShowAllHypotheses = false; + + string classifierFile = (argc == 1) ? preferredClassifier : argv[3]; + + CascadeClassifier classifierOpenCV; + if (!classifierOpenCV.load(classifierFile)) + return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1; + + int devId; + ncvAssertCUDAReturn(cudaGetDevice(&devId), -1); + cudaDeviceProp devProp; + ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1); + printf("Using GPU %d %s, arch=%d.%d\n", devId, devProp.name, devProp.major, devProp.minor); + + //============================================================================== + // + // Load the classifier from file (assuming its size is about 1 mb) + // using a simple allocator + // + //============================================================================== + + NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice); + ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1); + NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned); + ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1); + + Ncv32u haarNumStages, haarNumNodes, haarNumFeatures; + ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1); + + NCVVectorAlloc h_haarStages(cpuCascadeAllocator, haarNumStages); + ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1); + NCVVectorAlloc h_haarNodes(cpuCascadeAllocator, haarNumNodes); + ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1); + NCVVectorAlloc h_haarFeatures(cpuCascadeAllocator, haarNumFeatures); + ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1); + + HaarClassifierCascadeDescriptor haar; + ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, h_haarStages, h_haarNodes, h_haarFeatures); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1); + + NCVVectorAlloc d_haarStages(gpuCascadeAllocator, haarNumStages); + ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1); + NCVVectorAlloc d_haarNodes(gpuCascadeAllocator, haarNumNodes); + ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1); + NCVVectorAlloc d_haarFeatures(gpuCascadeAllocator, haarNumFeatures); + ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1); + + ncvStat = h_haarStages.copySolid(d_haarStages, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1); + ncvStat = h_haarNodes.copySolid(d_haarNodes, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error 
copying cascade to GPU", -1); + ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1); + + //============================================================================== + // + // Calculate memory requirements and create real allocators + // + //============================================================================== + + NCVMemStackAllocator gpuCounter(devProp.textureAlignment); + ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1); + NCVMemStackAllocator cpuCounter(devProp.textureAlignment); + ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1); + + ncvStat = process(NULL, frameSize.width, frameSize.height, + false, false, haar, + d_haarStages, d_haarNodes, + d_haarFeatures, h_haarStages, + gpuCounter, cpuCounter, devProp); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1); + + NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment); + ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1); + NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment); + ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1); + + printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height); + + //============================================================================== + // + // Main processing loop + // + //============================================================================== + + namedWindow(wndTitle, 1); + + Mat frame, gray, frameDisp; + + for(;;) + { + // For camera and video file, capture the next image + capture >> frame; + if (frame.empty()) + break; + + cvtColor(frame, gray, CV_BGR2GRAY); + + // process + NcvSize32u minSize = haar.ClassifierSize; + if (bLargestFace) + { + Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width; + Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height; + Ncv32u ratioSmallest = std::min(ratioX, ratioY); + ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f); + minSize.width *= ratioSmallest; + minSize.height *= ratioSmallest; + } + + NcvTimer timer = ncvStartTimer(); + + if (!bUseOpenCV) + { + ncvStat = process(&gray, frameSize.width, frameSize.height, + bShowAllHypotheses, bLargestFace, haar, + d_haarStages, d_haarNodes, + d_haarFeatures, h_haarStages, + gpuAllocator, cpuAllocator, devProp); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1); + } + else + { + vector rectsOpenCV; + + classifierOpenCV.detectMultiScale( + gray, + rectsOpenCV, + 1.2f, + bShowAllHypotheses && !bLargestFace ? 0 : 4, + (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE, + Size(minSize.width, minSize.height)); + + for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt) + rectangle(gray, rectsOpenCV[rt], Scalar(255)); + } + + Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer); + + cvtColor(gray, frameDisp, CV_GRAY2BGR); + + imagePrintf(frameDisp, 0, CV_RGB(255, 0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":""); + imagePrintf(frameDisp, 1, CV_RGB(255, 0,0), "L - Switch FullSearch%s / LargestFace%s modes", bLargestFace?"":" (ON)", bLargestFace?" 
(ON)":""); + imagePrintf(frameDisp, 2, CV_RGB(255, 0,0), "U - Toggle unfiltered hypotheses visualization in FullSearch %s", bShowAllHypotheses?"(ON)":"(OFF)"); + imagePrintf(frameDisp, 3, CV_RGB(118,185,0), " Running at %f FPS on %s", 1000.0f / avgTime, bUseOpenCV?"CPU":"GPU"); + + cv::imshow(wndTitle, frameDisp); + + switch (cvWaitKey(1)) + { + case ' ': + bUseOpenCV = !bUseOpenCV; + break; + case 'L':case 'l': + bLargestFace = !bLargestFace; + break; + case 'U':case 'u': + bShowAllHypotheses = !bShowAllHypotheses; + break; + case 27: + return 0; + } + } + + return 0; +} diff --git a/modules/gpu/src/nvidia/NCV.cpp b/modules/gpu/src/nvidia/NCV.cpp new file mode 100644 index 0000000000..fef9c08f5b --- /dev/null +++ b/modules/gpu/src/nvidia/NCV.cpp @@ -0,0 +1,571 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +#include + + +#if !defined (HAVE_CUDA) + + +#else /* !defined (HAVE_CUDA) */ + + +#include +#include "NCV.hpp" + + +//============================================================================== +// +// Error handling helpers +// +//============================================================================== + + +static void stdioDebugOutput(const char *msg) +{ + printf("%s", msg); +} + + +static NCVDebugOutputHandler *debugOutputHandler = stdioDebugOutput; + + +void ncvDebugOutput(const char *msg, ...) 
+{ + const int K_DEBUG_STRING_MAXLEN = 1024; + char buffer[K_DEBUG_STRING_MAXLEN]; + va_list args; + va_start(args, msg); + vsnprintf_s(buffer, K_DEBUG_STRING_MAXLEN, K_DEBUG_STRING_MAXLEN-1, msg, args); + va_end (args); + debugOutputHandler(buffer); +} + + +void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func) +{ + debugOutputHandler = func; +} + + +//============================================================================== +// +// Memory wrappers and helpers +// +//============================================================================== + + +NCVStatus GPUAlignmentValue(Ncv32u &alignment) +{ + int curDev; + cudaDeviceProp curProp; + ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR); + alignment = curProp.textureAlignment; //GPUAlignmentValue(curProp.major); + return NCV_SUCCESS; +} + + +Ncv32u alignUp(Ncv32u what, Ncv32u alignment) +{ + Ncv32u alignMask = alignment-1; + Ncv32u inverseAlignMask = ~alignMask; + Ncv32u res = (what + alignMask) & inverseAlignMask; + return res; +} + + +void NCVMemPtr::clear() +{ + ptr = NULL; + memtype = NCVMemoryTypeNone; +} + + +void NCVMemSegment::clear() +{ + begin.clear(); + size = 0; +} + + +NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NCVMemoryType srcType, size_t sz, cudaStream_t cuStream) +{ + NCVStatus ncvStat; + switch (dstType) + { + case NCVMemoryTypeHostPageable: + case NCVMemoryTypeHostPinned: + switch (srcType) + { + case NCVMemoryTypeHostPageable: + case NCVMemoryTypeHostPinned: + memcpy(dst, src, sz); + ncvStat = NCV_SUCCESS; + break; + case NCVMemoryTypeDevice: + if (cuStream != 0) + { + ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); + } + else + { + ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR); + } + ncvStat = NCV_SUCCESS; + break; + default: + ncvStat = NCV_MEM_RESIDENCE_ERROR; + } + break; + case NCVMemoryTypeDevice: + switch (srcType) + { + case NCVMemoryTypeHostPageable: + case NCVMemoryTypeHostPinned: + if (cuStream != 0) + { + ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); + } + else + { + ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyHostToDevice), NCV_CUDA_ERROR); + } + ncvStat = NCV_SUCCESS; + break; + case NCVMemoryTypeDevice: + if (cuStream != 0) + { + ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR); + } + else + { + ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR); + } + ncvStat = NCV_SUCCESS; + break; + default: + ncvStat = NCV_MEM_RESIDENCE_ERROR; + } + break; + default: + ncvStat = NCV_MEM_RESIDENCE_ERROR; + } + + return ncvStat; +} + + +//=================================================================== +// +// NCVMemStackAllocator class members implementation +// +//=================================================================== + + +NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment) + : + currentSize(0), + _maxSize(0), + allocBegin(NULL), + begin(NULL), + _memType(NCVMemoryTypeNone), + _alignment(alignment) +{ + NcvBool bProperAlignment = (alignment & (alignment-1)) == 0; + ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2"); +} + + +NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment) + : + currentSize(0), + _maxSize(0), + 
allocBegin(NULL), + _memType(memT), + _alignment(alignment) +{ + NcvBool bProperAlignment = (alignment & (alignment-1)) == 0; + ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2"); + + allocBegin = NULL; + + switch (memT) + { + case NCVMemoryTypeDevice: + ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), ); + break; + case NCVMemoryTypeHostPinned: + ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), ); + break; + case NCVMemoryTypeHostPageable: + allocBegin = (Ncv8u *)malloc(capacity); + break; + } + + if (capacity == 0) + { + allocBegin = (Ncv8u *)(0x1); + } + + if (!isCounting()) + { + begin = allocBegin; + end = begin + capacity; + } +} + + +NCVMemStackAllocator::~NCVMemStackAllocator() +{ + if (allocBegin != NULL) + { + ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction"); + switch (_memType) + { + case NCVMemoryTypeDevice: + ncvAssertCUDAReturn(cudaFree(allocBegin), ); + break; + case NCVMemoryTypeHostPinned: + ncvAssertCUDAReturn(cudaFreeHost(allocBegin), ); + break; + case NCVMemoryTypeHostPageable: + free(allocBegin); + break; + } + allocBegin = NULL; + } +} + + +NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size) +{ + seg.clear(); + ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC); + + size = alignUp(size, this->_alignment); + this->currentSize += size; + this->_maxSize = std::max(this->_maxSize, this->currentSize); + + if (!isCounting()) + { + size_t availSize = end - begin; + ncvAssertReturn(size <= availSize, NCV_ALLOCATOR_INSUFFICIENT_CAPACITY); + } + + seg.begin.ptr = begin; + seg.begin.memtype = this->_memType; + seg.size = size; + begin += size; + + return NCV_SUCCESS; +} + + +NCVStatus NCVMemStackAllocator::dealloc(NCVMemSegment &seg) +{ + ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC); + ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC); + ncvAssertReturn(seg.begin.ptr != NULL || isCounting(), NCV_ALLOCATOR_BAD_DEALLOC); + ncvAssertReturn(seg.begin.ptr == begin - seg.size, NCV_ALLOCATOR_DEALLOC_ORDER); + + currentSize -= seg.size; + begin -= seg.size; + + seg.clear(); + + ncvAssertReturn(allocBegin <= begin, NCV_ALLOCATOR_BAD_DEALLOC); + + return NCV_SUCCESS; +} + + +NcvBool NCVMemStackAllocator::isInitialized(void) const +{ + return ((this->_alignment & (this->_alignment-1)) == 0) && isCounting() || this->allocBegin != NULL; +} + + +NcvBool NCVMemStackAllocator::isCounting(void) const +{ + return this->_memType == NCVMemoryTypeNone; +} + + +NCVMemoryType NCVMemStackAllocator::memType(void) const +{ + return this->_memType; +} + + +Ncv32u NCVMemStackAllocator::alignment(void) const +{ + return this->_alignment; +} + + +size_t NCVMemStackAllocator::maxSize(void) const +{ + return this->_maxSize; +} + + +//=================================================================== +// +// NCVMemNativeAllocator class members implementation +// +//=================================================================== + + +NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT) + : + currentSize(0), + _maxSize(0), + _memType(memT) +{ + ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", ); + ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", ); +} + + +NCVMemNativeAllocator::~NCVMemNativeAllocator() +{ + 
ncvAssertPrintCheck(currentSize == 0, "NCVMemNativeAllocator dtor:: detected memory leak"); +} + + +NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size) +{ + seg.clear(); + ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC); + + switch (this->_memType) + { + case NCVMemoryTypeDevice: + ncvAssertCUDAReturn(cudaMalloc(&seg.begin.ptr, size), NCV_CUDA_ERROR); + break; + case NCVMemoryTypeHostPinned: + ncvAssertCUDAReturn(cudaMallocHost(&seg.begin.ptr, size), NCV_CUDA_ERROR); + break; + case NCVMemoryTypeHostPageable: + seg.begin.ptr = (Ncv8u *)malloc(size); + break; + } + + this->currentSize += alignUp(size, this->_alignment); + this->_maxSize = std::max(this->_maxSize, this->currentSize); + + seg.begin.memtype = this->_memType; + seg.size = size; + + return NCV_SUCCESS; +} + + +NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg) +{ + ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC); + ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC); + ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC); + + ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC); + currentSize -= alignUp(seg.size, this->_alignment); + + switch (this->_memType) + { + case NCVMemoryTypeDevice: + ncvAssertCUDAReturn(cudaFree(seg.begin.ptr), NCV_CUDA_ERROR); + break; + case NCVMemoryTypeHostPinned: + ncvAssertCUDAReturn(cudaFreeHost(seg.begin.ptr), NCV_CUDA_ERROR); + break; + case NCVMemoryTypeHostPageable: + free(seg.begin.ptr); + break; + } + + seg.clear(); + + return NCV_SUCCESS; +} + + +NcvBool NCVMemNativeAllocator::isInitialized(void) const +{ + return (this->_alignment != 0); +} + + +NcvBool NCVMemNativeAllocator::isCounting(void) const +{ + return false; +} + + +NCVMemoryType NCVMemNativeAllocator::memType(void) const +{ + return this->_memType; +} + + +Ncv32u NCVMemNativeAllocator::alignment(void) const +{ + return this->_alignment; +} + + +size_t NCVMemNativeAllocator::maxSize(void) const +{ + return this->_maxSize; +} + + +//=================================================================== +// +// Time and timer routines +// +//=================================================================== + + +typedef struct _NcvTimeMoment NcvTimeMoment; + +#if defined(_WIN32) || defined(_WIN64) + + #include + + typedef struct _NcvTimeMoment + { + LONGLONG moment, freq; + } NcvTimeMoment; + + + static void _ncvQueryMoment(NcvTimeMoment *t) + { + QueryPerformanceFrequency((LARGE_INTEGER *)&(t->freq)); + QueryPerformanceCounter((LARGE_INTEGER *)&(t->moment)); + } + + + double _ncvMomentToMicroseconds(NcvTimeMoment *t) + { + return 1000000.0 * t->moment / t->freq; + } + + + double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2) + { + return 1000000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq); + } + + + double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2) + { + return 1000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq); + } + +#elif defined(__unix__) + + #include + + typedef struct _NcvTimeMoment + { + struct timeval tv; + struct timezone tz; + } NcvTimeMoment; + + + void _ncvQueryMoment(NcvTimeMoment *t) + { + gettimeofday(& t->tv, & t->tz); + } + + + double _ncvMomentToMicroseconds(NcvTimeMoment *t) + { + return 1000000.0 * t->tv.tv_sec + (double)t->tv.tv_usec; + } + + + double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2) + { + return (((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000000 + 
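// Illustrative sketch (excluded from compilation via #if 0): the timer API defined in this
// section is a start/stop pair. ncvStartTimer() allocates an opaque NcvTimer and records the
// first moment; ncvEndQueryTimerUs()/ncvEndQueryTimerMs() record the second moment, free the
// handle and return the elapsed time, so each handle must be ended exactly once.
#if 0
static double timeStreamMsExample(cudaStream_t cuStream)
{
    NcvTimer timer = ncvStartTimer();
    // ... enqueue GPU work on cuStream here ...
    cudaStreamSynchronize(cuStream);     // make sure the measured work has actually finished
    return ncvEndQueryTimerMs(timer);    // frees `timer`; do not reuse the handle afterwards
}
#endif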
(double)t2->tv.tv_usec - (double)t1->tv.tv_usec); + } + + +#endif //#if defined(_WIN32) || defined(_WIN64) + + +struct _NcvTimer +{ + NcvTimeMoment t1, t2; +}; + + +NcvTimer ncvStartTimer(void) +{ + struct _NcvTimer *t; + t = (struct _NcvTimer *)malloc(sizeof(struct _NcvTimer)); + _ncvQueryMoment(&t->t1); + return t; +} + + +double ncvEndQueryTimerUs(NcvTimer t) +{ + double res; + _ncvQueryMoment(&t->t2); + res = _ncvMomentsDiffToMicroseconds(&t->t1, &t->t2); + free(t); + return res; +} + + +double ncvEndQueryTimerMs(NcvTimer t) +{ + double res; + _ncvQueryMoment(&t->t2); + res = _ncvMomentsDiffToMilliseconds(&t->t1, &t->t2); + free(t); + return res; +} + +#endif /* !defined (HAVE_CUDA) */ \ No newline at end of file diff --git a/modules/gpu/src/nvidia/NCV.hpp b/modules/gpu/src/nvidia/NCV.hpp new file mode 100644 index 0000000000..a71f650252 --- /dev/null +++ b/modules/gpu/src/nvidia/NCV.hpp @@ -0,0 +1,837 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef _ncv_hpp_ +#define _ncv_hpp_ + +#include +#include "npp_staging.h" + + +//============================================================================== +// +// Alignment macros +// +//============================================================================== + + +#if !defined(__align__) && !defined(__CUDACC__) + #if defined(_WIN32) || defined(_WIN64) + #define __align__(n) __declspec(align(n)) + #elif defined(__unix__) + #define __align__(n) __attribute__((__aligned__(n))) + #endif +#endif + + +//============================================================================== +// +// Integral and compound types of guaranteed size +// +//============================================================================== + + +typedef bool NcvBool; +typedef long long Ncv64s; +typedef unsigned long long Ncv64u; +typedef int Ncv32s; +typedef unsigned int Ncv32u; +typedef short Ncv16s; +typedef unsigned short Ncv16u; +typedef char Ncv8s; +typedef unsigned char Ncv8u; +typedef float Ncv32f; +typedef double Ncv64f; + + +typedef struct +{ + Ncv8u x; + Ncv8u y; + Ncv8u width; + Ncv8u height; +} NcvRect8u; + + +typedef struct +{ + Ncv32s x; ///< x-coordinate of upper left corner. + Ncv32s y; ///< y-coordinate of upper left corner. + Ncv32s width; ///< Rectangle width. + Ncv32s height; ///< Rectangle height. +} NcvRect32s; + + +typedef struct +{ + Ncv32u x; ///< x-coordinate of upper left corner. + Ncv32u y; ///< y-coordinate of upper left corner. + Ncv32u width; ///< Rectangle width. + Ncv32u height; ///< Rectangle height. +} NcvRect32u; + + +typedef struct +{ + Ncv32s width; ///< Rectangle width. + Ncv32s height; ///< Rectangle height. +} NcvSize32s; + + +typedef struct +{ + Ncv32u width; ///< Rectangle width. + Ncv32u height; ///< Rectangle height. +} NcvSize32u; + + +NPPST_CT_ASSERT(sizeof(NcvBool) <= 4); +NPPST_CT_ASSERT(sizeof(Ncv64s) == 8); +NPPST_CT_ASSERT(sizeof(Ncv64u) == 8); +NPPST_CT_ASSERT(sizeof(Ncv32s) == 4); +NPPST_CT_ASSERT(sizeof(Ncv32u) == 4); +NPPST_CT_ASSERT(sizeof(Ncv16s) == 2); +NPPST_CT_ASSERT(sizeof(Ncv16u) == 2); +NPPST_CT_ASSERT(sizeof(Ncv8s) == 1); +NPPST_CT_ASSERT(sizeof(Ncv8u) == 1); +NPPST_CT_ASSERT(sizeof(Ncv32f) == 4); +NPPST_CT_ASSERT(sizeof(Ncv64f) == 8); +NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u)); +NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s)); +NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u)); +NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u)); + + +//============================================================================== +// +// Persistent constants +// +//============================================================================== + + +const Ncv32u K_WARP_SIZE = 32; +const Ncv32u K_LOG2_WARP_SIZE = 5; + + +//============================================================================== +// +// Error handling +// +//============================================================================== + + +#define NCV_CT_PREP_STRINGIZE_AUX(x) #x +#define NCV_CT_PREP_STRINGIZE(x) NCV_CT_PREP_STRINGIZE_AUX(x) + + +void ncvDebugOutput(const char *msg, ...); + + +typedef void NCVDebugOutputHandler(const char* msg); + + +void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func); + + +#define ncvAssertPrintCheck(pred, msg) \ + ((pred) ? 
true : (ncvDebugOutput("\n%s\n", \ + "NCV Assertion Failed: " msg ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__) \ + ), false)) + + +#define ncvAssertPrintReturn(pred, msg, err) \ + if (ncvAssertPrintCheck(pred, msg)) ; else return err + + +#define ncvAssertReturn(pred, err) \ + do \ + { \ + if (!(pred)) \ + { \ + ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: retcode=", (int)err, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \ + return err; \ + } \ + } while (0) + + +#define ncvAssertReturnNcvStat(ncvOp) \ + do \ + { \ + NCVStatus _ncvStat = ncvOp; \ + if (NCV_SUCCESS != _ncvStat) \ + { \ + ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: NcvStat=", (int)_ncvStat, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \ + return _ncvStat; \ + } \ + } while (0) + + +#define ncvAssertCUDAReturn(cudacall, errCode) \ + do \ + { \ + cudaError_t resCall = cudacall; \ + cudaError_t resGLE = cudaGetLastError(); \ + if (cudaSuccess != resCall || cudaSuccess != resGLE) \ + { \ + ncvDebugOutput("\n%s%d%s\n", "NCV CUDA Assertion Failed: cudaError_t=", (int)(resCall | resGLE), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \ + return errCode; \ + } \ + } while (0) + + +/** +* Return-codes for status notification, errors and warnings +*/ +enum NCVStatus +{ + NCV_SUCCESS, + + NCV_CUDA_ERROR, + NCV_NPP_ERROR, + NCV_FILE_ERROR, + + NCV_NULL_PTR, + NCV_INCONSISTENT_INPUT, + NCV_TEXTURE_BIND_ERROR, + NCV_DIMENSIONS_INVALID, + + NCV_INVALID_ROI, + NCV_INVALID_STEP, + NCV_INVALID_SCALE, + + NCV_ALLOCATOR_NOT_INITIALIZED, + NCV_ALLOCATOR_BAD_ALLOC, + NCV_ALLOCATOR_BAD_DEALLOC, + NCV_ALLOCATOR_INSUFFICIENT_CAPACITY, + NCV_ALLOCATOR_DEALLOC_ORDER, + NCV_ALLOCATOR_BAD_REUSE, + + NCV_MEM_COPY_ERROR, + NCV_MEM_RESIDENCE_ERROR, + NCV_MEM_INSUFFICIENT_CAPACITY, + + NCV_HAAR_INVALID_PIXEL_STEP, + NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER, + NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE, + NCV_HAAR_TOO_LARGE_FEATURES, + NCV_HAAR_XML_LOADING_EXCEPTION, + + NCV_NOIMPL_HAAR_TILTED_FEATURES, + + NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW, +}; + + +#define NCV_SET_SKIP_COND(x) \ + bool __ncv_skip_cond = x + + +#define NCV_RESET_SKIP_COND(x) \ + __ncv_skip_cond = x + + +#define NCV_SKIP_COND_BEGIN \ + if (!__ncv_skip_cond) { + + +#define NCV_SKIP_COND_END \ + } + + +//============================================================================== +// +// Timer +// +//============================================================================== + + +typedef struct _NcvTimer *NcvTimer; + +NcvTimer ncvStartTimer(void); + +double ncvEndQueryTimerUs(NcvTimer t); + +double ncvEndQueryTimerMs(NcvTimer t); + + +//============================================================================== +// +// Memory management classes template compound types +// +//============================================================================== + + +/** +* Alignment of GPU memory chunks in bytes +*/ +NCVStatus GPUAlignmentValue(Ncv32u &alignment); + + +/** +* Calculates the aligned top bound value +*/ +Ncv32u alignUp(Ncv32u what, Ncv32u alignment); + + +/** +* NCVMemoryType +*/ +enum NCVMemoryType +{ + NCVMemoryTypeNone, + NCVMemoryTypeHostPageable, + NCVMemoryTypeHostPinned, + NCVMemoryTypeDevice +}; + + +/** +* NCVMemPtr +*/ +struct NCVMemPtr +{ + void *ptr; + NCVMemoryType memtype; + void clear(); +}; + + +/** +* NCVMemSegment +*/ +struct NCVMemSegment +{ + NCVMemPtr begin; + size_t size; + void clear(); +}; + + +/** +* INCVMemAllocator (Interface) +*/ +class INCVMemAllocator +{ +public: 
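// Illustrative sketch (excluded from compilation via #if 0): how the assertion macros and the
// NCV_SKIP_COND_* pair above are meant to be combined in an NCVStatus-returning routine. When
// the allocator runs in counting mode it hands out NULL pointers, so real GPU work is
// bracketed by NCV_SKIP_COND_BEGIN/END while allocation sizes are still recorded. The routine
// itself is a made-up example.
#if 0
static NCVStatus exampleRoutine(INCVMemAllocator &gpuAllocator, size_t bytes, cudaStream_t cuStream)
{
    ncvAssertReturn(gpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    NCVMemSegment seg;
    ncvAssertReturnNcvStat(gpuAllocator.alloc(seg, bytes));

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
    NCV_SKIP_COND_BEGIN
    // Executed only when memory actually exists behind seg.begin.ptr.
    ncvAssertCUDAReturn(cudaMemsetAsync(seg.begin.ptr, 0, bytes, cuStream), NCV_CUDA_ERROR);
    NCV_SKIP_COND_END

    ncvAssertReturnNcvStat(gpuAllocator.dealloc(seg));
    return NCV_SUCCESS;
}
#endif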
+ virtual ~INCVMemAllocator() = 0; + + virtual NCVStatus alloc(NCVMemSegment &seg, size_t size) = 0; + virtual NCVStatus dealloc(NCVMemSegment &seg) = 0; + + virtual NcvBool isInitialized(void) const = 0; + virtual NcvBool isCounting(void) const = 0; + + virtual NCVMemoryType memType(void) const = 0; + virtual Ncv32u alignment(void) const = 0; + virtual size_t maxSize(void) const = 0; +}; + +inline INCVMemAllocator::~INCVMemAllocator() {} + + +/** +* NCVMemStackAllocator +*/ +class NCVMemStackAllocator : public INCVMemAllocator +{ + NCVMemStackAllocator(); + NCVMemStackAllocator(const NCVMemStackAllocator &); + +public: + + explicit NCVMemStackAllocator(Ncv32u alignment); + NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment); + virtual ~NCVMemStackAllocator(); + + virtual NCVStatus alloc(NCVMemSegment &seg, size_t size); + virtual NCVStatus dealloc(NCVMemSegment &seg); + + virtual NcvBool isInitialized(void) const; + virtual NcvBool isCounting(void) const; + + virtual NCVMemoryType memType(void) const; + virtual Ncv32u alignment(void) const; + virtual size_t maxSize(void) const; + +private: + + NCVMemoryType _memType; + Ncv32u _alignment; + Ncv8u *allocBegin; + Ncv8u *begin; + Ncv8u *end; + size_t currentSize; + size_t _maxSize; +}; + + +/** +* NCVMemNativeAllocator +*/ +class NCVMemNativeAllocator : public INCVMemAllocator +{ +public: + + NCVMemNativeAllocator(NCVMemoryType memT); + virtual ~NCVMemNativeAllocator(); + + virtual NCVStatus alloc(NCVMemSegment &seg, size_t size); + virtual NCVStatus dealloc(NCVMemSegment &seg); + + virtual NcvBool isInitialized(void) const; + virtual NcvBool isCounting(void) const; + + virtual NCVMemoryType memType(void) const; + virtual Ncv32u alignment(void) const; + virtual size_t maxSize(void) const; + +private: + + NCVMemNativeAllocator(); + NCVMemNativeAllocator(const NCVMemNativeAllocator &); + + NCVMemoryType _memType; + Ncv32u _alignment; + size_t currentSize; + size_t _maxSize; +}; + + +/** +* Copy dispatcher +*/ +NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, + const void *src, NCVMemoryType srcType, + size_t sz, cudaStream_t cuStream); + + +/** +* NCVVector (1D) +*/ +template +class NCVVector +{ + NCVVector(const NCVVector &); + +public: + + NCVVector() + { + clear(); + } + + virtual ~NCVVector() {} + + void clear() + { + _ptr = NULL; + _length = 0; + _memtype = NCVMemoryTypeNone; + } + + NCVStatus copySolid(NCVVector &dst, cudaStream_t cuStream, size_t howMuch=0) + { + if (howMuch == 0) + { + ncvAssertReturn(dst._length == this->_length, NCV_MEM_COPY_ERROR); + howMuch = this->_length * sizeof(T); + } + else + { + ncvAssertReturn(dst._length * sizeof(T) >= howMuch && + this->_length * sizeof(T) >= howMuch && + howMuch > 0, NCV_MEM_COPY_ERROR); + } + ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) && + (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR); + + NCVStatus ncvStat = NCV_SUCCESS; + if (this->_memtype != NCVMemoryTypeNone) + { + ncvStat = memSegCopyHelper(dst._ptr, dst._memtype, + this->_ptr, this->_memtype, + howMuch, cuStream); + } + + return ncvStat; + } + + T *ptr() const {return this->_ptr;} + size_t length() const {return this->_length;} + NCVMemoryType memType() const {return this->_memtype;} + +protected: + + T *_ptr; + size_t _length; + NCVMemoryType _memtype; +}; + + +/** +* NCVVectorAlloc +*/ +template +class NCVVectorAlloc : public NCVVector +{ + NCVVectorAlloc(); + NCVVectorAlloc(const NCVVectorAlloc &); + +public: + + 
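// Illustrative sketch (excluded from compilation via #if 0): pairing NCVVectorAlloc on a
// device allocator with a host-side counterpart and copySolid(). copySolid() dispatches
// through memSegCopyHelper(), so the copy direction follows from the two memory types and a
// non-zero stream makes the copy asynchronous. Sizes and names are made up for the example.
#if 0
static NCVStatus downloadVectorExample(INCVMemAllocator &gpuAllocator,
                                       INCVMemAllocator &cpuAllocator,
                                       cudaStream_t cuStream)
{
    NCVVectorAlloc<Ncv32u> d_vec(gpuAllocator, 1024);
    ncvAssertReturn(d_vec.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<Ncv32u> h_vec(cpuAllocator, 1024);
    ncvAssertReturn(h_vec.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    // ... fill d_vec on the GPU ...

    ncvAssertReturnNcvStat(d_vec.copySolid(h_vec, cuStream));                 // device to host
    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
    return NCV_SUCCESS;
}
#endif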
NCVVectorAlloc(INCVMemAllocator &allocator, Ncv32u length) + : + allocator(allocator) + { + NCVStatus ncvStat; + + this->clear(); + this->allocatedMem.clear(); + + ncvStat = allocator.alloc(this->allocatedMem, length * sizeof(T)); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVVectorAlloc ctor:: alloc failed", ); + + this->_ptr = (T *)this->allocatedMem.begin.ptr; + this->_length = length; + this->_memtype = this->allocatedMem.begin.memtype; + } + + + ~NCVVectorAlloc() + { + NCVStatus ncvStat; + + ncvStat = allocator.dealloc(this->allocatedMem); + ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVVectorAlloc dtor:: dealloc failed"); + + this->clear(); + } + + + NcvBool isMemAllocated() const + { + return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting()); + } + + + Ncv32u getAllocatorsAlignment() const + { + return allocator.alignment(); + } + + + NCVMemSegment getSegment() const + { + return allocatedMem; + } + +private: + + INCVMemAllocator &allocator; + NCVMemSegment allocatedMem; +}; + + +/** +* NCVVectorReuse +*/ +template +class NCVVectorReuse : public NCVVector +{ + NCVVectorReuse(); + NCVVectorReuse(const NCVVectorReuse &); + +public: + + explicit NCVVectorReuse(const NCVMemSegment &memSegment) + { + this->bReused = false; + this->clear(); + + this->_length = memSegment.size / sizeof(T); + this->_ptr = (T *)memSegment.begin.ptr; + this->_memtype = memSegment.begin.memtype; + + this->bReused = true; + } + + + NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length) + { + this->bReused = false; + this->clear(); + + ncvAssertPrintReturn(length * sizeof(T) <= memSegment.size, \ + "NCVVectorReuse ctor:: memory binding failed due to size mismatch", ); + + this->_length = length; + this->_ptr = (T *)memSegment.begin.ptr; + this->_memtype = memSegment.begin.memtype; + + this->bReused = true; + } + + + NcvBool isMemReused() const + { + return this->bReused; + } + +private: + + NcvBool bReused; +}; + + +/** +* NCVMatrix (2D) +*/ +template +class NCVMatrix +{ + NCVMatrix(const NCVMatrix &); + +public: + + NCVMatrix() + { + clear(); + } + + virtual ~NCVMatrix() {} + + + void clear() + { + _ptr = NULL; + _pitch = 0; + _width = 0; + _height = 0; + _memtype = NCVMemoryTypeNone; + } + + + Ncv32u stride() const + { + return _pitch / sizeof(T); + } + + + NCVStatus copySolid(NCVMatrix &dst, cudaStream_t cuStream, size_t howMuch=0) + { + if (howMuch == 0) + { + ncvAssertReturn(dst._pitch == this->_pitch && + dst._height == this->_height, NCV_MEM_COPY_ERROR); + howMuch = this->_pitch * this->_height; + } + else + { + ncvAssertReturn(dst._pitch * dst._height >= howMuch && + this->_pitch * this->_height >= howMuch && + howMuch > 0, NCV_MEM_COPY_ERROR); + } + ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) && + (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR); + + NCVStatus ncvStat = NCV_SUCCESS; + if (this->_memtype != NCVMemoryTypeNone) + { + ncvStat = memSegCopyHelper(dst._ptr, dst._memtype, + this->_ptr, this->_memtype, + howMuch, cuStream); + } + + return ncvStat; + } + + T *ptr() const {return this->_ptr;} + Ncv32u width() const {return this->_width;} + Ncv32u height() const {return this->_height;} + Ncv32u pitch() const {return this->_pitch;} + NCVMemoryType memType() const {return this->_memtype;} + +protected: + + T *_ptr; + Ncv32u _width; + Ncv32u _height; + Ncv32u _pitch; + NCVMemoryType _memtype; +}; + + +/** +* NCVMatrixAlloc +*/ +template +class NCVMatrixAlloc : public NCVMatrix +{ + NCVMatrixAlloc(); + 
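// Illustrative sketch (excluded from compilation via #if 0): the pitch/stride bookkeeping used
// by NCVMatrix. pitch() is the padded row size in bytes (row width in bytes rounded up to the
// allocator alignment), while stride() converts it back to elements of T. The concrete numbers
// assume a hypothetical 128-byte alignment.
#if 0
static void pitchExample(INCVMemAllocator &gpuAllocator)
{
    // 100 columns of Ncv32u = 400 bytes per row; with a 128-byte alignment the pitch becomes
    // alignUp(400, 128) = 512 bytes, i.e. stride() == 512 / sizeof(Ncv32u) == 128 elements.
    NCVMatrixAlloc<Ncv32u> d_mat(gpuAllocator, 100, 64);
    Ncv32u elementsPerRow = d_mat.stride();   // 128 under the assumed alignment
    Ncv32u bytesPerRow    = d_mat.pitch();    // 512 under the assumed alignment
    (void)elementsPerRow; (void)bytesPerRow;
}
#endif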
NCVMatrixAlloc(const NCVMatrixAlloc &); + +public: + + NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0) + : + allocator(allocator) + { + NCVStatus ncvStat; + + this->clear(); + this->allocatedMem.clear(); + + Ncv32u widthBytes = width * sizeof(T); + Ncv32u pitchBytes = alignUp(widthBytes, allocator.alignment()); + + if (pitch != 0) + { + ncvAssertPrintReturn(pitch >= pitchBytes && + (pitch & (allocator.alignment() - 1)) == 0, + "NCVMatrixAlloc ctor:: incorrect pitch passed", ); + pitchBytes = pitch; + } + + Ncv32u requiredAllocSize = pitchBytes * height; + + ncvStat = allocator.alloc(this->allocatedMem, requiredAllocSize); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc ctor:: alloc failed", ); + + this->_ptr = (T *)this->allocatedMem.begin.ptr; + this->_width = width; + this->_height = height; + this->_pitch = pitchBytes; + this->_memtype = this->allocatedMem.begin.memtype; + } + + ~NCVMatrixAlloc() + { + NCVStatus ncvStat; + + ncvStat = allocator.dealloc(this->allocatedMem); + ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc dtor:: dealloc failed"); + + this->clear(); + } + + + NcvBool isMemAllocated() const + { + return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting()); + } + + + Ncv32u getAllocatorsAlignment() const + { + return allocator.alignment(); + } + + + NCVMemSegment getSegment() const + { + return allocatedMem; + } + +private: + + INCVMemAllocator &allocator; + NCVMemSegment allocatedMem; +}; + + +/** +* NCVMatrixReuse +*/ +template +class NCVMatrixReuse : public NCVMatrix +{ + NCVMatrixReuse(); + NCVMatrixReuse(const NCVMatrixReuse &); + +public: + + NCVMatrixReuse(const NCVMemSegment &memSegment, Ncv32u alignment, Ncv32u width, Ncv32u height, Ncv32u pitch=0, NcvBool bSkipPitchCheck=false) + { + this->bReused = false; + this->clear(); + + Ncv32u widthBytes = width * sizeof(T); + Ncv32u pitchBytes = alignUp(widthBytes, alignment); + + if (pitch != 0) + { + if (!bSkipPitchCheck) + { + ncvAssertPrintReturn(pitch >= pitchBytes && + (pitch & (alignment - 1)) == 0, + "NCVMatrixReuse ctor:: incorrect pitch passed", ); + } + else + { + ncvAssertPrintReturn(pitch >= widthBytes, "NCVMatrixReuse ctor:: incorrect pitch passed", ); + } + pitchBytes = pitch; + } + + ncvAssertPrintReturn(pitchBytes * height <= memSegment.size, \ + "NCVMatrixReuse ctor:: memory binding failed due to size mismatch", ); + + this->_width = width; + this->_height = height; + this->_pitch = pitchBytes; + this->_ptr = (T *)memSegment.begin.ptr; + this->_memtype = memSegment.begin.memtype; + + this->bReused = true; + } + + + NcvBool isMemReused() const + { + return this->bReused; + } + +private: + + NcvBool bReused; +}; + +#endif // _ncv_hpp_ diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu new file mode 100644 index 0000000000..a501d6525e --- /dev/null +++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu @@ -0,0 +1,2603 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +//////////////////////////////////////////////////////////////////////////////// +// +// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework +// +// The algorithm and code are explained in the upcoming GPU Computing Gems +// chapter in detail: +// +// Anton Obukhov, "Haar Classifiers for Object Detection with CUDA" +// PDF URL placeholder +// email: aobukhov@nvidia.com, devsupport@nvidia.com +// +// Credits for help with the code to: +// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include + +#include "npp.h" +#include "NCV.hpp" +#include "NCVRuntimeTemplates.hpp" +#include "NCVHaarObjectDetection.hpp" + +void groupRectangles(std::vector &hypotheses, int groupThreshold, double eps, std::vector *weights); + + +//============================================================================== +// +// BlockScan file +// +//============================================================================== + + +//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() +//assuming size <= WARP_SIZE and size is power of 2 +template +inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) +{ + Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); + s_Data[pos] = 0; + pos += K_WARP_SIZE; + s_Data[pos] = idata; + + for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) + { + s_Data[pos] += s_Data[pos - offset]; + } + + return s_Data[pos]; +} + + +template +inline __device__ T warpScanExclusive(T idata, volatile T *s_Data) +{ + return warpScanInclusive(idata, s_Data) - idata; +} + + +template +inline __device__ T blockScanInclusive(T idata, volatile T *s_Data) +{ + if (tiNumScanThreads > K_WARP_SIZE) + { + //Bottom-level inclusive warp scan + T warpResult = warpScanInclusive(idata, s_Data); + + //Save top elements of each warp for exclusive warp scan + //sync to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) ) + { + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult; + } + + //wait for warp scans to complete + __syncthreads(); + + if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) ) + { + //grab top warp elements + T val = s_Data[threadIdx.x]; + //calculate exclusive scan and write back to shared memory + s_Data[threadIdx.x] = warpScanExclusive(val, s_Data); + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE]; + } + else + { + return warpScanInclusive(idata, s_Data); + } +} + + +//============================================================================== +// +// HaarClassifierCascade file +// +//============================================================================== + + +const Ncv32u MAX_GRID_DIM = 65535; + + +const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64; + + +#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2 6 +#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2) + + +/** \internal +* Haar features solid array. +*/ +texture texHaarFeatures; + + +/** \internal +* Haar classifiers flattened trees container. +* Two parts: first contains root nodes, second - nodes that are referred by root nodes. 
+* Drawback: breaks tree locality (might cause more cache misses +* Advantage: No need to introduce additional 32-bit field to index root nodes offsets +*/ +texture texHaarClassifierNodes; + + +texture texIImage; + + +__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages) +{ + return d_Stages[iStage]; +} + + +template +__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes) +{ + HaarClassifierNode128 tmpNode; + if (tbCacheTextureCascade) + { + tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode); + } + else + { + tmpNode = d_ClassifierNodes[iNode]; + } + return tmpNode; +} + + +template +__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features, + Ncv32f *weight, + Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight) +{ + HaarFeature64 feature; + if (tbCacheTextureCascade) + { + feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature); + } + else + { + feature = d_Features[iFeature]; + } + feature.getRect(rectX, rectY, rectWidth, rectHeight); + *weight = feature.getWeight(); +} + + +template +__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg) +{ + if (tbCacheTextureIImg) + { + return tex1Dfetch(texIImage, x); + } + else + { + return d_IImg[x]; + } +} + + +__device__ Ncv32f reduceSpecialization(Ncv32f partialSum) +{ + __shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL]; + reductor[threadIdx.x] = partialSum; + __syncthreads(); + +#if defined CPU_FP_COMPLIANCE + if (!threadIdx.x) + { + Ncv32f sum = 0.0f; + for (int i=0; i= 8 + if (threadIdx.x < 128) + { + reductor[threadIdx.x] += reductor[threadIdx.x + 128]; + } + __syncthreads(); +#endif +#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7 + if (threadIdx.x < 64) + { + reductor[threadIdx.x] += reductor[threadIdx.x + 64]; + } + __syncthreads(); +#endif + + if (threadIdx.x < 32) + { +#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6 + reductor[threadIdx.x] += reductor[threadIdx.x + 32]; +#endif +#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5 + reductor[threadIdx.x] += reductor[threadIdx.x + 16]; +#endif + reductor[threadIdx.x] += reductor[threadIdx.x + 8]; + reductor[threadIdx.x] += reductor[threadIdx.x + 4]; + reductor[threadIdx.x] += reductor[threadIdx.x + 2]; + reductor[threadIdx.x] += reductor[threadIdx.x + 1]; + } +#endif + + __syncthreads(); + + return reductor[0]; +} + + +__device__ Ncv32u d_outMaskPosition; + + +__inline __device__ void compactBlockWriteOutAnchorParallel(NcvBool threadPassFlag, + Ncv32u threadElem, + Ncv32u *vectorOut) +{ +#if __CUDA_ARCH__ >= 110 + Ncv32u passMaskElem = threadPassFlag ? 
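// Illustrative sketch (excluded from compilation via #if 0): why the compaction writer below
// needs an exclusive scan. For per-thread pass flags {1,0,1,1,0,1} the inclusive scan is
// {1,1,2,3,3,4} and the exclusive scan {0,1,1,2,2,3}; each passing thread writes its payload
// at its exclusive-scan value, and the last inclusive value (4, the survivor count) is what
// the block reserves in the global output with a single atomicAdd.
#if 0
static void scanOnHostExample()
{
    const int n = 6;
    int pass[n]      = {1, 0, 1, 1, 0, 1};
    int exclusive[n] = {0};
    int running = 0;
    for (int i = 0; i < n; ++i)
    {
        exclusive[i] = running;   // output slot of thread i if pass[i] != 0
        running     += pass[i];   // ends up as the survivor count (4 here)
    }
}
#endif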
1 : 0; + __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2]; + Ncv32u incScan = blockScanInclusive(passMaskElem, shmem); + __syncthreads(); + Ncv32u excScan = incScan - passMaskElem; + + __shared__ Ncv32u numPassed; + __shared__ Ncv32u outMaskOffset; + if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1) + { + numPassed = incScan; + outMaskOffset = atomicAdd(&d_outMaskPosition, incScan); + } + __syncthreads(); + + if (threadPassFlag) + { + shmem[excScan] = threadElem; + } + __syncthreads(); + + if (threadIdx.x < numPassed) + { + vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x]; + } +#endif +} + + +template +__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride, + Ncv32f *d_weights, Ncv32u weightsStride, + HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, + Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, + NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) +{ + Ncv32u y_offs; + Ncv32u x_offs; + Ncv32u maskOffset; + Ncv32u outMaskVal; + + NcvBool bInactiveThread = false; + + if (tbReadPixelIndexFromVector) + { + maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x; + + if (maskOffset >= mask1Dlen) + { + if (tbDoAtomicCompaction) bInactiveThread = true; else return; + } + + if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) + { + outMaskVal = d_inMask[maskOffset]; + y_offs = outMaskVal >> 16; + x_offs = outMaskVal & 0xFFFF; + } + } + else + { + y_offs = blockIdx.y; + x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x; + + if (x_offs >= mask2Dstride) + { + if (tbDoAtomicCompaction) bInactiveThread = true; else return; + } + + if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) + { + maskOffset = y_offs * mask2Dstride + x_offs; + + if ((x_offs >= anchorsRoi.width) || + (!tbInitMaskPositively && + d_inMask != d_outMask && + d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U)) + { + if (tbDoAtomicCompaction) + { + bInactiveThread = true; + } + else + { + d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U; + return; + } + } + + outMaskVal = (y_offs << 16) | x_offs; + } + } + + NcvBool bPass = true; + + if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) + { + Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs]; + + for (Ncv32u iStage = startStageInc; iStage(iNode, d_ClassifierNodes); + HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); + Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); + Ncv32u iFeature = featuresDesc.getFeaturesOffset(); + + Ncv32f curNodeVal = 0.0f; + + for (Ncv32u iRect=0; iRect + (iFeature + iRect, d_Features, + &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); + + Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); + Ncv32u iioffsTR = iioffsTL + rectWidth; + Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; + Ncv32u iioffsBR = iioffsBL + rectWidth; + + Ncv32u rectSum = getElemIImg(iioffsBR, d_IImg) - + getElemIImg(iioffsBL, d_IImg) + + getElemIImg(iioffsTL, d_IImg) - + getElemIImg(iioffsTR, d_IImg); + +#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY + curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); +#else + curNodeVal += (Ncv32f)rectSum * rectWeight; +#endif + } + + HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc(); + HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc(); + Ncv32f 
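// Illustrative sketch (excluded from compilation via #if 0): the four-corner lookup performed
// for every feature rectangle above. With an integral image II where II(y,x) holds the sum of
// all pixels above and to the left of (x,y), the sum over a rectangle with top-left (x,y) and
// size (w,h) is II(y,x) + II(y+h,x+w) - II(y,x+w) - II(y+h,x), which is exactly the
// iioffsTL/TR/BL/BR combination used by the kernel.
#if 0
static Ncv32u rectSumFromIntegralExample(const Ncv32u *II, Ncv32u stride,
                                         Ncv32u x, Ncv32u y, Ncv32u w, Ncv32u h)
{
    Ncv32u tl = II[ y      * stride +  x     ];
    Ncv32u tr = II[ y      * stride + (x + w)];
    Ncv32u bl = II[(y + h) * stride +  x     ];
    Ncv32u br = II[(y + h) * stride + (x + w)];
    return br - bl + tl - tr;   // same expression as in the kernel
}
#endif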
nodeThreshold = curNode.getThreshold(); + HaarClassifierNodeDescriptor32 nextNodeDescriptor; + nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? nodeLeft : nodeRight; + + if (nextNodeDescriptor.isLeaf()) + { + Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue(); + curStageSum += tmpLeafValue; + bMoreNodesToTraverse = false; + } + else + { + iNode = nextNodeDescriptor.getNextNodeOffset(); + } + } + + __syncthreads(); + curRootNodeOffset++; + } + + if (curStageSum < stageThreshold) + { + bPass = false; + outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U; + break; + } + } + } + + __syncthreads(); + + if (!tbDoAtomicCompaction) + { + if (!tbReadPixelIndexFromVector || + (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask))) + { + d_outMask[maskOffset] = outMaskVal; + } + } + else + { + compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread, + outMaskVal, + d_outMask); + } +} + + +template +__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride, + Ncv32f *d_weights, Ncv32u weightsStride, + HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, + Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, + NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) +{ + Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x; + + if (maskOffset >= mask1Dlen) + { + return; + } + + Ncv32u outMaskVal = d_inMask[maskOffset]; + Ncv32u y_offs = outMaskVal >> 16; + Ncv32u x_offs = outMaskVal & 0xFFFF; + + Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs]; + NcvBool bPass = true; + + for (Ncv32u iStage = startStageInc; iStage> NUM_THREADS_CLASSIFIERPARALLEL_LOG2; + + for (Ncv32u chunkId=0; chunkId(iNode, d_ClassifierNodes); + HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); + Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); + Ncv32u iFeature = featuresDesc.getFeaturesOffset(); + + Ncv32f curNodeVal = 0.0f; + //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce + for (Ncv32u iRect=0; iRect + (iFeature + iRect, d_Features, + &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); + + Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); + Ncv32u iioffsTR = iioffsTL + rectWidth; + Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; + Ncv32u iioffsBR = iioffsBL + rectWidth; + + Ncv32u rectSum = getElemIImg(iioffsBR, d_IImg) - + getElemIImg(iioffsBL, d_IImg) + + getElemIImg(iioffsTL, d_IImg) - + getElemIImg(iioffsTR, d_IImg); + +#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY + curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); +#else + curNodeVal += (Ncv32f)rectSum * rectWeight; +#endif + } + + HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc(); + HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc(); + Ncv32f nodeThreshold = curNode.getThreshold(); + HaarClassifierNodeDescriptor32 nextNodeDescriptor; + nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? 
nodeLeft : nodeRight; + + if (nextNodeDescriptor.isLeaf()) + { + Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue(); + curStageSum += tmpLeafValue; + bMoreNodesToTraverse = false; + } + else + { + iNode = nextNodeDescriptor.getNextNodeOffset(); + } + } + } + __syncthreads(); + + curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL; + } + + Ncv32f finalStageSum = reduceSpecialization(curStageSum); + + if (finalStageSum < stageThreshold) + { + bPass = false; + outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U; + break; + } + } + + if (!tbDoAtomicCompaction) + { + if (!bPass || d_inMask != d_outMask) + { + if (!threadIdx.x) + { + d_outMask[maskOffset] = outMaskVal; + } + } + } + else + { +#if __CUDA_ARCH__ >= 110 + if (bPass && !threadIdx.x) + { + Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1); + d_outMask[outMaskOffset] = outMaskVal; + } +#endif + } +} + + +template +__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, + NcvSize32u anchorsRoi, Ncv32u step) +{ + Ncv32u y_offs = blockIdx.y; + Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x; + Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs; + + Ncv32u y_offs_upsc = step * y_offs; + Ncv32u x_offs_upsc = step * x_offs; + Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc; + + Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U; + + if (x_offs_upsc < anchorsRoi.width && + (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U)) + { + outElem = (y_offs_upsc << 16) | x_offs_upsc; + } + + if (!tbDoAtomicCompaction) + { + d_outMask[outMaskOffset] = outElem; + } + else + { + compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U, + outElem, + d_outMask); + } +} + + +struct applyHaarClassifierAnchorParallelFunctor +{ + dim3 gridConf, blockConf; + cudaStream_t cuStream; + + //Kernel arguments are stored as members; + Ncv32u *d_IImg; + Ncv32u IImgStride; + Ncv32f *d_weights; + Ncv32u weightsStride; + HaarFeature64 *d_Features; + HaarClassifierNode128 *d_ClassifierNodes; + HaarStage64 *d_Stages; + Ncv32u *d_inMask; + Ncv32u *d_outMask; + Ncv32u mask1Dlen; + Ncv32u mask2Dstride; + NcvSize32u anchorsRoi; + Ncv32u startStageInc; + Ncv32u endStageExc; + Ncv32f scaleArea; + + //Arguments are passed through the constructor + applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, + Ncv32u *_d_IImg, Ncv32u _IImgStride, + Ncv32f *_d_weights, Ncv32u _weightsStride, + HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, + Ncv32u *_d_inMask, Ncv32u *_d_outMask, + Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, + NcvSize32u _anchorsRoi, Ncv32u _startStageInc, + Ncv32u _endStageExc, Ncv32f _scaleArea) : + gridConf(_gridConf), + blockConf(_blockConf), + cuStream(_cuStream), + d_IImg(_d_IImg), + IImgStride(_IImgStride), + d_weights(_d_weights), + weightsStride(_weightsStride), + d_Features(_d_Features), + d_ClassifierNodes(_d_ClassifierNodes), + d_Stages(_d_Stages), + d_inMask(_d_inMask), + d_outMask(_d_outMask), + mask1Dlen(_mask1Dlen), + mask2Dstride(_mask2Dstride), + anchorsRoi(_anchorsRoi), + startStageInc(_startStageInc), + endStageExc(_endStageExc), + scaleArea(_scaleArea) + {} + + template + void call(TList tl) + { + applyHaarClassifierAnchorParallel < + Loki::TL::TypeAt::Result::value, + Loki::TL::TypeAt::Result::value, + Loki::TL::TypeAt::Result::value, + Loki::TL::TypeAt::Result::value, + Loki::TL::TypeAt::Result::value > + 
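// Illustrative sketch (excluded from compilation via #if 0): the idea behind the *DynTemplate
// wrappers used below. Runtime booleans cannot be passed as template arguments directly, so
// the wrapper enumerates all 2^N flag combinations once and launches the matching
// specialization; the Loki typelist machinery merely automates that enumeration. A hand-rolled
// two-flag equivalent (with a hypothetical kernel launcher) looks like this:
#if 0
template <bool tbA, bool tbB>
void exampleLaunch();   // hypothetical stand-in for a templated kernel launch

static void exampleDynDispatch(bool a, bool b)
{
    if (a) { if (b) exampleLaunch<true,  true>(); else exampleLaunch<true,  false>(); }
    else   { if (b) exampleLaunch<false, true>(); else exampleLaunch<false, false>(); }
}
#endif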
<<>> + (d_IImg, IImgStride, + d_weights, weightsStride, + d_Features, d_ClassifierNodes, d_Stages, + d_inMask, d_outMask, + mask1Dlen, mask2Dstride, + anchorsRoi, startStageInc, + endStageExc, scaleArea); + } +}; + + +void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, + NcvBool tbCacheTextureIImg, + NcvBool tbCacheTextureCascade, + NcvBool tbReadPixelIndexFromVector, + NcvBool tbDoAtomicCompaction, + + dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, + + Ncv32u *d_IImg, Ncv32u IImgStride, + Ncv32f *d_weights, Ncv32u weightsStride, + HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, + Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, + NcvSize32u anchorsRoi, Ncv32u startStageInc, + Ncv32u endStageExc, Ncv32f scaleArea) +{ + //Second parameter is the number of "dynamic" template parameters + NCVRuntimeTemplateBool::KernelCaller + ::call( applyHaarClassifierAnchorParallelFunctor(gridConf, blockConf, cuStream, + d_IImg, IImgStride, + d_weights, weightsStride, + d_Features, d_ClassifierNodes, d_Stages, + d_inMask, d_outMask, + mask1Dlen, mask2Dstride, + anchorsRoi, startStageInc, + endStageExc, scaleArea), + 0xC001C0DE, //this is dummy int for the va_args C compatibility + tbInitMaskPositively, + tbCacheTextureIImg, + tbCacheTextureCascade, + tbReadPixelIndexFromVector, + tbDoAtomicCompaction); +} + + +struct applyHaarClassifierClassifierParallelFunctor +{ + dim3 gridConf, blockConf; + cudaStream_t cuStream; + + //Kernel arguments are stored as members; + Ncv32u *d_IImg; + Ncv32u IImgStride; + Ncv32f *d_weights; + Ncv32u weightsStride; + HaarFeature64 *d_Features; + HaarClassifierNode128 *d_ClassifierNodes; + HaarStage64 *d_Stages; + Ncv32u *d_inMask; + Ncv32u *d_outMask; + Ncv32u mask1Dlen; + Ncv32u mask2Dstride; + NcvSize32u anchorsRoi; + Ncv32u startStageInc; + Ncv32u endStageExc; + Ncv32f scaleArea; + + //Arguments are passed through the constructor + applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, + Ncv32u *_d_IImg, Ncv32u _IImgStride, + Ncv32f *_d_weights, Ncv32u _weightsStride, + HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, + Ncv32u *_d_inMask, Ncv32u *_d_outMask, + Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, + NcvSize32u _anchorsRoi, Ncv32u _startStageInc, + Ncv32u _endStageExc, Ncv32f _scaleArea) : + gridConf(_gridConf), + blockConf(_blockConf), + cuStream(_cuStream), + d_IImg(_d_IImg), + IImgStride(_IImgStride), + d_weights(_d_weights), + weightsStride(_weightsStride), + d_Features(_d_Features), + d_ClassifierNodes(_d_ClassifierNodes), + d_Stages(_d_Stages), + d_inMask(_d_inMask), + d_outMask(_d_outMask), + mask1Dlen(_mask1Dlen), + mask2Dstride(_mask2Dstride), + anchorsRoi(_anchorsRoi), + startStageInc(_startStageInc), + endStageExc(_endStageExc), + scaleArea(_scaleArea) + {} + + template + void call(TList tl) + { + applyHaarClassifierClassifierParallel < + Loki::TL::TypeAt::Result::value, + Loki::TL::TypeAt::Result::value, + Loki::TL::TypeAt::Result::value > + <<>> + (d_IImg, IImgStride, + d_weights, weightsStride, + d_Features, d_ClassifierNodes, d_Stages, + d_inMask, d_outMask, + mask1Dlen, mask2Dstride, + anchorsRoi, startStageInc, + endStageExc, scaleArea); + } +}; + + +void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, + NcvBool tbCacheTextureCascade, + NcvBool tbDoAtomicCompaction, + + dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, + + 
Ncv32u *d_IImg, Ncv32u IImgStride, + Ncv32f *d_weights, Ncv32u weightsStride, + HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, + Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, + NcvSize32u anchorsRoi, Ncv32u startStageInc, + Ncv32u endStageExc, Ncv32f scaleArea) +{ + //Second parameter is the number of "dynamic" template parameters + NCVRuntimeTemplateBool::KernelCaller + ::call( applyHaarClassifierClassifierParallelFunctor(gridConf, blockConf, cuStream, + d_IImg, IImgStride, + d_weights, weightsStride, + d_Features, d_ClassifierNodes, d_Stages, + d_inMask, d_outMask, + mask1Dlen, mask2Dstride, + anchorsRoi, startStageInc, + endStageExc, scaleArea), + 0xC001C0DE, //this is dummy int for the va_args C compatibility + tbCacheTextureIImg, + tbCacheTextureCascade, + tbDoAtomicCompaction); +} + + +struct initializeMaskVectorFunctor +{ + dim3 gridConf, blockConf; + cudaStream_t cuStream; + + //Kernel arguments are stored as members; + Ncv32u *d_inMask; + Ncv32u *d_outMask; + Ncv32u mask1Dlen; + Ncv32u mask2Dstride; + NcvSize32u anchorsRoi; + Ncv32u step; + + //Arguments are passed through the constructor + initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, + Ncv32u *_d_inMask, Ncv32u *_d_outMask, + Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, + NcvSize32u _anchorsRoi, Ncv32u _step) : + gridConf(_gridConf), + blockConf(_blockConf), + cuStream(_cuStream), + d_inMask(_d_inMask), + d_outMask(_d_outMask), + mask1Dlen(_mask1Dlen), + mask2Dstride(_mask2Dstride), + anchorsRoi(_anchorsRoi), + step(_step) + {} + + template + void call(TList tl) + { + initializeMaskVector < + Loki::TL::TypeAt::Result::value, + Loki::TL::TypeAt::Result::value > + <<>> + (d_inMask, d_outMask, + mask1Dlen, mask2Dstride, + anchorsRoi, step); + } +}; + + +void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask, + NcvBool tbDoAtomicCompaction, + + dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, + + Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, + NcvSize32u anchorsRoi, Ncv32u step) +{ + //Second parameter is the number of "dynamic" template parameters + NCVRuntimeTemplateBool::KernelCaller + ::call( initializeMaskVectorFunctor(gridConf, blockConf, cuStream, + d_inMask, d_outMask, + mask1Dlen, mask2Dstride, + anchorsRoi, step), + 0xC001C0DE, //this is dummy int for the va_args C compatibility + tbMaskByInmask, + tbDoAtomicCompaction); +} + + +Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages) +{ + Ncv32u i = 0; + for (; i= N) + { + break; + } + } + return i; +} + + +template +void swap(T &p1, T &p2) +{ + T tmp = p1; + p1 = p2; + p2 = tmp; +} + + +NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImage, + NCVMatrix &d_weights, + NCVMatrixAlloc &d_pixelMask, + Ncv32u &numDetections, + HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages, + NCVVector &d_HaarStages, + NCVVector &d_HaarNodes, + NCVVector &d_HaarFeatures, + NcvBool bMaskElements, + NcvSize32u anchorsRoi, + Ncv32u pixelStep, + Ncv32f scaleArea, + INCVMemAllocator &gpuAllocator, + INCVMemAllocator &cpuAllocator, + Ncv32u devPropMajor, + Ncv32u devPropMinor, + cudaStream_t cuStream) +{ + ncvAssertReturn(d_integralImage.memType() == d_weights.memType() && + d_integralImage.memType() == d_pixelMask.memType() && + d_integralImage.memType() == gpuAllocator.memType() && + (d_integralImage.memType() == NCVMemoryTypeDevice || + 
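// Illustrative sketch (excluded from compilation via #if 0): the pixel-mask encoding the
// detection pipeline below passes between kernels. Each surviving anchor is packed as
// (y << 16) | x into a single Ncv32u, rejected anchors become
// OBJDET_MASK_ELEMENT_INVALID_32U, and the compaction steps squeeze the survivors into a
// dense prefix of the mask vector whose length is tracked in numDetections.
#if 0
static void maskElementExample()
{
    Ncv32u x = 37, y = 12;
    Ncv32u packed = (y << 16) | x;   // what initializeMaskVector and the kernels store
    Ncv32u yBack  = packed >> 16;    // 12
    Ncv32u xBack  = packed & 0xFFFF; // 37
    (void)yBack; (void)xBack;
}
#endif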
d_integralImage.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() && + d_HaarStages.memType() == d_HaarFeatures.memType() && + (d_HaarStages.memType() == NCVMemoryTypeDevice || + d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED); + ncvAssertReturn((d_integralImage.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL && + h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL && + d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR); + ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 && + d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height && + d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height && + d_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width && + d_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE); + ncvAssertReturn(d_HaarStages.length() >= haar.NumStages && + d_HaarNodes.length() >= haar.NumClassifierTotalNodes && + d_HaarFeatures.length() >= haar.NumFeatures && + d_HaarStages.length() == h_HaarStages.length() && + haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES); + ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP); + + NCV_SET_SKIP_COND(gpuAllocator.isCounting()); + +#if defined _SELF_TEST_ + + NCVStatus ncvStat; + + NCVMatrixAlloc h_integralImage(cpuAllocator, d_integralImage.width, d_integralImage.height, d_integralImage.pitch); + ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVMatrixAlloc h_weights(cpuAllocator, d_weights.width, d_weights.height, d_weights.pitch); + ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVMatrixAlloc h_pixelMask(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch); + ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVVectorAlloc h_HaarNodes(cpuAllocator, d_HaarNodes.length); + ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVVectorAlloc h_HaarFeatures(cpuAllocator, d_HaarFeatures.length); + ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVMatrixAlloc h_pixelMask_d(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch); + ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCV_SKIP_COND_BEGIN + + ncvStat = d_pixelMask.copySolid(h_pixelMask, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvStat = d_integralImage.copySolid(h_integralImage, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvStat = d_weights.copySolid(h_weights, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++) + { + for (Ncv32u j=0; j d_vecPixelMask(d_pixelMask.getSegment(), 
anchorsRoi.height * d_pixelMask.stride()); + ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE); + + NCVVectorAlloc d_vecPixelMaskTmp(gpuAllocator, d_vecPixelMask.length()); + ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVVectorAlloc hp_pool32u(cpuAllocator, 2); + ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + Ncv32u *hp_zero = &hp_pool32u.ptr()[0]; + Ncv32u *hp_numDet = &hp_pool32u.ptr()[1]; + + NCV_SKIP_COND_BEGIN + *hp_zero = 0; + *hp_numDet = 0; + NCV_SKIP_COND_END + + Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) * + (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER)); + + NcvBool bTexCacheCascade = devPropMajor < 2; + NcvBool bTexCacheIImg = true; //this works better even on Fermi so far + NcvBool bDoAtomicCompaction = devPropMajor >= 2 || (devPropMajor == 1 && devPropMinor >= 3); + + NCVVector *d_ptrNowData = &d_vecPixelMask; + NCVVector *d_ptrNowTmp = &d_vecPixelMaskTmp; + + Ncv32u szNppCompactTmpBuf; + nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf); + if (bDoAtomicCompaction) + { + szNppCompactTmpBuf = 0; + } + NCVVectorAlloc d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf); + + NCV_SKIP_COND_BEGIN + + if (bTexCacheIImg) + { + cudaChannelFormatDesc cfdTexIImage; + cfdTexIImage = cudaCreateChannelDesc(); + + size_t alignmentOffset; + ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, d_integralImage.ptr(), cfdTexIImage, + (anchorsRoi.height + haar.ClassifierSize.height) * d_integralImage.pitch()), NCV_CUDA_ERROR); + ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); + } + + if (bTexCacheCascade) + { + cudaChannelFormatDesc cfdTexHaarFeatures; + cudaChannelFormatDesc cfdTexHaarClassifierNodes; + cfdTexHaarFeatures = cudaCreateChannelDesc(); + cfdTexHaarClassifierNodes = cudaCreateChannelDesc(); + + size_t alignmentOffset; + ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures, + d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR); + ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); + ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes, + d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR); + ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); + } + + Ncv32u stageStartAnchorParallel = 0; + Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL, + haar, h_HaarStages); + Ncv32u stageEndClassifierParallel = haar.NumStages; + if (stageMiddleSwitch == 0) + { + stageMiddleSwitch = 1; + } + + //create stages subdivision for pixel-parallel processing + const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 
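// Illustrative sketch (excluded from compilation via #if 0): the stage-subdivision loop below
// builds cut points [0, c, 2c, ..., stageMiddleSwitch] so that the anchor-parallel kernel runs
// c (= compactEveryNstage) cascade stages per launch and surviving anchors are compacted
// between launches. This simplified version omits the boundary adjustment done in the real
// code.
#if 0
#include <vector>

static void stageStopsExample(Ncv32u stageMiddleSwitch, Ncv32u compactEveryNstage)
{
    std::vector<Ncv32u> stops;
    for (Ncv32u s = 0; s < stageMiddleSwitch; s += compactEveryNstage)
    {
        stops.push_back(s);                 // first stage of the next group
    }
    stops.push_back(stageMiddleSwitch);     // final cut point, then switch to classifier-parallel
}
#endif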
7 : 1; + Ncv32u curStop = stageStartAnchorParallel; + std::vector pixParallelStageStops; + while (curStop < stageMiddleSwitch) + { + pixParallelStageStops.push_back(curStop); + curStop += compactEveryNstage; + } + if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2) + { + pixParallelStageStops[pixParallelStageStops.size()-1] = + (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2; + } + pixParallelStageStops.push_back(stageMiddleSwitch); + Ncv32u pixParallelStageStopsIndex = 0; + + if (pixelStep != 1 || bMaskElements) + { + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), + 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + + dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL), + (anchorsRoi.height + pixelStep - 1) / pixelStep); + dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL); + + if (gridInit.x == 0 || gridInit.y == 0) + { + numDetections = 0; + return NCV_SUCCESS; + } + + initializeMaskVectorDynTemplate(bMaskElements, + bDoAtomicCompaction, + gridInit, blockInit, cuStream, + d_ptrNowData->ptr(), + d_ptrNowTmp->ptr(), + d_vecPixelMask.length(), d_pixelMask.stride(), + anchorsRoi, pixelStep); + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), + 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + swap(d_ptrNowData, d_ptrNowTmp); + } + else + { + NppStStatus nppSt; + nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(), + d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, + d_tmpBufCompact.ptr(), szNppCompactTmpBuf); + ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + } + numDetections = *hp_numDet; + } + else + { + // + // 1. Run the first pixel-input pixel-parallel classifier for few stages + // + + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), + 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + + dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL), + anchorsRoi.height); + dim3 block1(NUM_THREADS_ANCHORSPARALLEL); + applyHaarClassifierAnchorParallelDynTemplate( + true, //tbInitMaskPositively + bTexCacheIImg, //tbCacheTextureIImg + bTexCacheCascade, //tbCacheTextureCascade + pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector + bDoAtomicCompaction, //tbDoAtomicCompaction + grid1, + block1, + cuStream, + d_integralImage.ptr(), d_integralImage.stride(), + d_weights.ptr(), d_weights.stride(), + d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), + d_ptrNowData->ptr(), + bDoAtomicCompaction ? 
d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), + 0, + d_pixelMask.stride(), + anchorsRoi, + pixParallelStageStops[pixParallelStageStopsIndex], + pixParallelStageStops[pixParallelStageStopsIndex+1], + scaleAreaPixels); + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), + 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + else + { + NppStStatus nppSt; + nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(), + d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, + d_tmpBufCompact.ptr(), szNppCompactTmpBuf); + ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + } + + swap(d_ptrNowData, d_ptrNowTmp); + numDetections = *hp_numDet; + + pixParallelStageStopsIndex++; + } + + // + // 2. Run pixel-parallel stages + // + + for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++) + { + if (numDetections == 0) + { + break; + } + + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), + 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + + dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL); + if (numDetections > MAX_GRID_DIM) + { + grid2.x = MAX_GRID_DIM; + grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM; + } + dim3 block2(NUM_THREADS_ANCHORSPARALLEL); + + applyHaarClassifierAnchorParallelDynTemplate( + false, //tbInitMaskPositively + bTexCacheIImg, //tbCacheTextureIImg + bTexCacheCascade, //tbCacheTextureCascade + pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector + bDoAtomicCompaction, //tbDoAtomicCompaction + grid2, + block2, + cuStream, + d_integralImage.ptr(), d_integralImage.stride(), + d_weights.ptr(), d_weights.stride(), + d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), + d_ptrNowData->ptr(), + bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), + numDetections, + d_pixelMask.stride(), + anchorsRoi, + pixParallelStageStops[pixParallelStageStopsIndex], + pixParallelStageStops[pixParallelStageStopsIndex+1], + scaleAreaPixels); + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), + 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + else + { + NppStStatus nppSt; + nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections, + d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, + d_tmpBufCompact.ptr(), szNppCompactTmpBuf); + ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + } + + swap(d_ptrNowData, d_ptrNowTmp); + numDetections = *hp_numDet; + } + + // + // 3. 
Run all left stages in one stage-parallel kernel + // + + if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel) + { + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), + 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + + dim3 grid3(numDetections); + if (numDetections > MAX_GRID_DIM) + { + grid3.x = MAX_GRID_DIM; + grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM; + } + dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL); + + applyHaarClassifierClassifierParallelDynTemplate( + bTexCacheIImg, //tbCacheTextureIImg + bTexCacheCascade, //tbCacheTextureCascade + bDoAtomicCompaction, //tbDoAtomicCompaction + grid3, + block3, + cuStream, + d_integralImage.ptr(), d_integralImage.stride(), + d_weights.ptr(), d_weights.stride(), + d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), + d_ptrNowData->ptr(), + bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), + numDetections, + d_pixelMask.stride(), + anchorsRoi, + stageMiddleSwitch, + stageEndClassifierParallel, + scaleAreaPixels); + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + + if (bDoAtomicCompaction) + { + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), + 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + else + { + NppStStatus nppSt; + nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections, + d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, + d_tmpBufCompact.ptr(), szNppCompactTmpBuf); + ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + } + + swap(d_ptrNowData, d_ptrNowTmp); + numDetections = *hp_numDet; + } + + if (d_ptrNowData != &d_vecPixelMask) + { + d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + +#if defined _SELF_TEST_ + + ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + + if (bDoAtomicCompaction) + { + std::sort(h_pixelMask_d.ptr, h_pixelMask_d.ptr + numDetections); + } + + Ncv32u fpu_oldcw, fpu_cw; + _controlfp_s(&fpu_cw, 0, 0); + fpu_oldcw = fpu_cw; + _controlfp_s(&fpu_cw, _PC_24, _MCW_PC); + Ncv32u numDetGold; + ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar, + h_HaarStages, h_HaarNodes, h_HaarFeatures, + bMaskElements, anchorsRoi, pixelStep, scaleArea); + ncvAssertReturnNcvStat(ncvStat); + _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC); + + bool bPass = true; + + if (numDetGold != numDetections) + { + printf("NCVHaarClassifierCascade::applyHaarClassifierCascade numdetections don't match: cpu=%d, gpu=%d\n", numDetGold, numDetections); + bPass = false; + } + else + { + for (Ncv32u i=0; i> 16)); + res.width = (Ncv32u)(scale * width); + res.height = (Ncv32u)(scale * height); + return res; +} + + +__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements, + NcvRect32u *hypotheses, + Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale) +{ + Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x; + Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x; + if (elemAddr >= numElements) + { + return; + } + hypotheses[elemAddr] = 
pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale); +} + + +NCVStatus ncvGrowDetectionsVector_device(NCVVector &pixelMask, + Ncv32u numPixelMaskDetections, + NCVVector &hypotheses, + Ncv32u &totalDetections, + Ncv32u totalMaxDetections, + Ncv32u rectWidth, + Ncv32u rectHeight, + Ncv32f curScale, + cudaStream_t cuStream) +{ + ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR); + ncvAssertReturn(pixelMask.memType() == hypotheses.memType() && + pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI); + ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE); + ncvAssertReturn(totalMaxDetections <= hypotheses.length() && + numPixelMaskDetections <= pixelMask.length() && + totalMaxDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT); + + NCVStatus ncvStat = NCV_SUCCESS; + Ncv32u numDetsToCopy = numPixelMaskDetections; + + if (numDetsToCopy == 0) + { + return ncvStat; + } + + if (totalDetections + numPixelMaskDetections > totalMaxDetections) + { + ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW; + numDetsToCopy = totalMaxDetections - totalDetections; + } + + dim3 block(NUM_GROW_THREADS); + dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS); + if (grid.x > 65535) + { + grid.y = (grid.x + 65534) / 65535; + grid.x = 65535; + } + growDetectionsKernel<<>>(pixelMask.ptr(), numDetsToCopy, + hypotheses.ptr() + totalDetections, + rectWidth, rectHeight, curScale); + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + + totalDetections += numDetsToCopy; + return ncvStat; +} + + +//============================================================================== +// +// Visualize file +// +//============================================================================== + + +const Ncv32u NUMTHREADS_DRAWRECTS = 32; +const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5; + + +template +__global__ void drawRects(T *d_dst, + Ncv32u dstStride, + Ncv32u dstWidth, + Ncv32u dstHeight, + NcvRect32u *d_rects, + Ncv32u numRects, + T color) +{ + Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x; + if (blockId > numRects * 4) + { + return; + } + + NcvRect32u curRect = d_rects[blockId >> 2]; + NcvBool bVertical = blockId & 0x1; + NcvBool bTopLeft = blockId & 0x2; + + Ncv32u pt0x, pt0y; + if (bVertical) + { + Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2; + + pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1; + pt0y = curRect.y; + + if (pt0x < dstWidth) + { + for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++) + { + Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x; + if (ptY < pt0y + curRect.height && ptY < dstHeight) + { + d_dst[ptY * dstStride + pt0x] = color; + } + } + } + } + else + { + Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2; + + pt0x = curRect.x; + pt0y = bTopLeft ? 
curRect.y : curRect.y + curRect.height - 1; + + if (pt0y < dstHeight) + { + for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++) + { + Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x; + if (ptX < pt0x + curRect.width && ptX < dstWidth) + { + d_dst[pt0y * dstStride + ptX] = color; + } + } + } + } +} + + +template +static NCVStatus drawRectsWrapperDevice(T *d_dst, + Ncv32u dstStride, + Ncv32u dstWidth, + Ncv32u dstHeight, + NcvRect32u *d_rects, + Ncv32u numRects, + T color, + cudaStream_t cuStream) +{ + ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR); + ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP); + ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID); + + if (numRects == 0) + { + return NCV_SUCCESS; + } + +#if defined _SELF_TEST_ + T *h_dst; + ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR); + NcvRect32s *h_rects; + ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32s)), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32s), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR); + ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color)); +#endif + + dim3 grid(numRects * 4); + dim3 block(NUMTHREADS_DRAWRECTS); + if (grid.x > 65535) + { + grid.y = (grid.x + 65534) / 65535; + grid.x = 65535; + } + + drawRects<<>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color); + + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + +#if defined _SELF_TEST_ + T *h_dst_after; + ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR); + bool bPass = true; + for (Ncv32u i=0; i &d_srcImg, + NcvSize32u srcRoi, + NCVVector &d_dstRects, + Ncv32u &dstNumRects, + + HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages, + NCVVector &d_HaarStages, + NCVVector &d_HaarNodes, + NCVVector &d_HaarFeatures, + + NcvSize32u minObjSize, + Ncv32u minNeighbors, //default 4 + Ncv32f scaleStep, //default 1.2f + Ncv32u pixelStep, //default 1 + Ncv32u flags, //default NCVPipeObjDet_Default + + INCVMemAllocator &gpuAllocator, + INCVMemAllocator &cpuAllocator, + Ncv32u devPropMajor, + Ncv32u devPropMinor, + cudaStream_t cuStream) +{ + ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() && + d_srcImg.memType() == gpuAllocator.memType() && + (d_srcImg.memType() == NCVMemoryTypeDevice || + d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() && + d_HaarStages.memType() == d_HaarFeatures.memType() && + (d_HaarStages.memType() == NCVMemoryTypeDevice || + d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED); + ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL && + h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL && + d_HaarFeatures.ptr() != NULL) || 
gpuAllocator.isCounting(), NCV_NULL_PTR); + ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 && + d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height && + srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height && + d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE); + ncvAssertReturn(d_HaarStages.length() >= haar.NumStages && + d_HaarNodes.length() >= haar.NumClassifierTotalNodes && + d_HaarFeatures.length() >= haar.NumFeatures && + d_HaarStages.length() == h_HaarStages.length() && + haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES); + ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP); + + //TODO: set NPP active stream to cuStream + + NCVStatus ncvStat; + NCV_SET_SKIP_COND(gpuAllocator.isCounting()); + + Ncv32u integralWidth = d_srcImg.width() + 1; + Ncv32u integralHeight = d_srcImg.height() + 1; + + NCVMatrixAlloc d_integralImage(gpuAllocator, integralWidth, integralHeight); + ncvAssertReturn(d_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVMatrixAlloc d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight); + ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVMatrixAlloc d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height()); + ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVMatrixAlloc d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height()); + ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVMatrixAlloc d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight); + ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVMatrixAlloc d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight); + ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVVectorAlloc d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height()); + ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + NCVVectorAlloc h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height()); + ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NppStStatus nppStat; + Ncv32u szTmpBufIntegral, szTmpBufSqIntegral; + nppStat = nppiStIntegralGetSize_8u32u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral); + ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + nppStat = nppiStSqrIntegralGetSize_8u64u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral); + ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + NCVVectorAlloc d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral)); + ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCV_SKIP_COND_BEGIN + + nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(), + d_integralImage.ptr(), d_integralImage.pitch(), + NppStSize32u(d_srcImg.width(), d_srcImg.height()), + d_tmpIIbuf.ptr(), szTmpBufIntegral); + ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + + nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(), + d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(), + NppStSize32u(d_srcImg.width(), d_srcImg.height()), + d_tmpIIbuf.ptr(), szTmpBufSqIntegral); + ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + + 
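+    // Note: both the plain and the squared integral images are built once at full
+    // input resolution here; d_tmpIIbuf was sized with std::max() above so the same
+    // scratch buffer serves both NPP_staging integral calls.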
NCV_SKIP_COND_END + + dstNumRects = 0; + + Ncv32u lastCheckedScale = 0; + NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0); + std::vector scalesVector; + + NcvBool bFoundLargestFace = false; + + for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep) + { + Ncv32u scale = (Ncv32u)scaleIter; + if (lastCheckedScale == scale) + { + continue; + } + lastCheckedScale = scale; + + if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width || + haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height) + { + continue; + } + + NcvSize32s srcRoi, srcIIRoi, scaledIIRoi, searchRoi; + + srcRoi.width = d_srcImg.width(); + srcRoi.height = d_srcImg.height(); + + srcIIRoi.width = srcRoi.width + 1; + srcIIRoi.height = srcRoi.height + 1; + + scaledIIRoi.width = srcIIRoi.width / scale; + scaledIIRoi.height = srcIIRoi.height / scale; + + searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width; + searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height; + + if (searchRoi.width <= 0 || searchRoi.height <= 0) + { + break; + } + + scalesVector.push_back(scale); + + if (gpuAllocator.isCounting()) + { + break; + } + } + + if (bReverseTraverseScale) + { + std::reverse(scalesVector.begin(), scalesVector.end()); + } + + //TODO: handle _fair_scale_ flag + for (Ncv32u i=0; i d_vecPixelMask(d_pixelMask.getSegment()); + ncvStat = ncvGrowDetectionsVector_device( + d_vecPixelMask, + detectionsOnThisScale, + d_hypothesesIntermediate, + dstNumRects, + d_hypothesesIntermediate.length(), + haar.ClassifierSize.width, + haar.ClassifierSize.height, + (Ncv32f)scale, + cuStream); + ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + + if (flags & NCVPipeObjDet_FindLargestObject) + { + if (dstNumRects == 0) + { + continue; + } + + if (dstNumRects != 0) + { + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream, + dstNumRects * sizeof(NcvRect32u)); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + } + + Ncv32u numStrongHypothesesNow = dstNumRects; + ncvStat = ncvFilterHypotheses_host( + h_hypothesesIntermediate, + numStrongHypothesesNow, + minNeighbors, + RECT_SIMILARITY_PROPORTION, + NULL); + ncvAssertReturnNcvStat(ncvStat); + + if (numStrongHypothesesNow > 0) + { + NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0]; + for (Ncv32u j=1; j d_dstRects.length()) + { + ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW; + dstNumRects = d_dstRects.length(); + } + + if (dstNumRects != 0) + { + ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, + dstNumRects * sizeof(NcvRect32u)); + ncvAssertReturnNcvStat(ncvStat); + } + } + + if (flags & NCVPipeObjDet_VisualizeInPlace) + { + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(), + d_srcImg.width(), d_srcImg.height(), + d_dstRects.ptr(), dstNumRects, 255, cuStream); + } + + NCV_SKIP_COND_END + + return ncvRetCode; +} + + +//============================================================================== +// +// Purely Host code: classifier IO, mock-ups +// +//============================================================================== + + +#ifdef _SELF_TEST_ +#include +#endif + + +NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix &h_integralImage, + NCVMatrix &h_weights, + NCVMatrixAlloc &h_pixelMask, + Ncv32u &numDetections, + HaarClassifierCascadeDescriptor &haar, + 
NCVVector &h_HaarStages, + NCVVector &h_HaarNodes, + NCVVector &h_HaarFeatures, + NcvBool bMaskElements, + NcvSize32u anchorsRoi, + Ncv32u pixelStep, + Ncv32f scaleArea) +{ + ncvAssertReturn(h_integralImage.memType() == h_weights.memType() && + h_integralImage.memType() == h_pixelMask.memType() && + (h_integralImage.memType() == NCVMemoryTypeHostPageable || + h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() && + h_HaarStages.memType() == h_HaarFeatures.memType() && + (h_HaarStages.memType() == NCVMemoryTypeHostPageable || + h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL && + h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR); + ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 && + h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height && + h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height && + h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width && + h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE); + ncvAssertReturn(h_HaarStages.length() >= haar.NumStages && + h_HaarNodes.length() >= haar.NumClassifierTotalNodes && + h_HaarFeatures.length() >= haar.NumFeatures && + h_HaarStages.length() == h_HaarStages.length() && + haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES); + ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP); + + Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) * + (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER)); + + for (Ncv32u i=0; i= anchorsRoi.width) + { + h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U; + } + else + { + for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++) + { + Ncv32f curStageSum = 0.0f; + Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes(); + Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset(); + + if (iStage == 0) + { + if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U) + { + break; + } + else + { + h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j); + } + } + else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U) + { + break; + } + + while (numRootNodesInStage--) + { + NcvBool bMoreNodesToTraverse = true; + Ncv32u curNodeOffset = curRootNodeOffset; + + while (bMoreNodesToTraverse) + { + HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset]; + Ncv32u curNodeFeaturesNum = curNode.getFeatureDesc().getNumFeatures(); + Ncv32u curNodeFeaturesOffs = curNode.getFeatureDesc().getFeaturesOffset(); + + Ncv32f curNodeVal = 0.f; + for (Ncv32u iRect=0; iRect &pixelMask, + Ncv32u numPixelMaskDetections, + NCVVector &hypotheses, + Ncv32u &totalDetections, + Ncv32u totalMaxDetections, + Ncv32u rectWidth, + Ncv32u rectHeight, + Ncv32f curScale) +{ + ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR); + ncvAssertReturn(pixelMask.memType() == 
hypotheses.memType() && + pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); + ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI); + ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE); + ncvAssertReturn(totalMaxDetections <= hypotheses.length() && + numPixelMaskDetections <= pixelMask.length() && + totalMaxDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT); + + NCVStatus ncvStat = NCV_SUCCESS; + Ncv32u numDetsToCopy = numPixelMaskDetections; + + if (numDetsToCopy == 0) + { + return ncvStat; + } + + if (totalDetections + numPixelMaskDetections > totalMaxDetections) + { + ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW; + numDetsToCopy = totalMaxDetections - totalDetections; + } + + for (Ncv32u i=0; i &hypotheses, + Ncv32u &numHypotheses, + Ncv32u minNeighbors, + Ncv32f intersectEps, + NCVVector *hypothesesWeights) +{ + ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable || + hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR); + if (hypothesesWeights != NULL) + { + ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable || + hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR); + } + + if (numHypotheses == 0) + { + return NCV_SUCCESS; + } + + std::vector rects(numHypotheses); + memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u)); + + std::vector weights; + if (hypothesesWeights != NULL) + { + groupRectangles(rects, minNeighbors, intersectEps, &weights); + } + else + { + groupRectangles(rects, minNeighbors, intersectEps, NULL); + } + + numHypotheses = (Ncv32u)rects.size(); + if (numHypotheses > 0) + { + memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u)); + } + + if (hypothesesWeights != NULL) + { + memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u)); + } + + return NCV_SUCCESS; +} + + +template +static NCVStatus drawRectsWrapperHost(T *h_dst, + Ncv32u dstStride, + Ncv32u dstWidth, + Ncv32u dstHeight, + NcvRect32u *h_rects, + Ncv32u numRects, + T color) +{ + ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR); + ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID); + ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP); + ncvAssertReturn(numRects != 0, NCV_SUCCESS); + ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID); + + for (Ncv32u i=0; i &haarStages, + std::vector &haarClassifierNodes, + std::vector &haarFeatures); + + +#define NVBIN_HAAR_SIZERESERVED 16 +#define NVBIN_HAAR_VERSION 0x1 + + +static NCVStatus loadFromNVBIN(const std::string &filename, + HaarClassifierCascadeDescriptor &haar, + std::vector &haarStages, + std::vector &haarClassifierNodes, + std::vector &haarFeatures) +{ + FILE *fp; + fopen_s(&fp, filename.c_str(), "rb"); + ncvAssertReturn(fp != NULL, NCV_FILE_ERROR); + Ncv32u fileVersion; + fread_s(&fileVersion, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp); + ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR); + Ncv32u fsize; + fread_s(&fsize, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp); + fseek(fp, 0, SEEK_END); + Ncv32u fsizeActual = ftell(fp); + ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR); + + std::vector fdata; + fdata.resize(fsize); + Ncv32u dataOffset = 0; + fseek(fp, 0, SEEK_SET); + fread_s(&fdata[0], fsize, fsize, 1, fp); + fclose(fp); + + //data + dataOffset = NVBIN_HAAR_SIZERESERVED; + haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset); + dataOffset += sizeof(Ncv32u); + 
haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset); + dataOffset += sizeof(Ncv32u); + haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset); + dataOffset += sizeof(Ncv32u); + haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset); + dataOffset += sizeof(Ncv32u); + haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset); + dataOffset += sizeof(NcvSize32u); + haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset); + dataOffset += sizeof(NcvBool); + haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset); + dataOffset += sizeof(NcvBool); + + haarStages.resize(haar.NumStages); + haarClassifierNodes.resize(haar.NumClassifierTotalNodes); + haarFeatures.resize(haar.NumFeatures); + + Ncv32u szStages = haar.NumStages * sizeof(HaarStage64); + Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128); + Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64); + + memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages); + dataOffset += szStages; + memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers); + dataOffset += szClassifiers; + memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures); + dataOffset += szFeatures; + + return NCV_SUCCESS; +} + + +NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages, + Ncv32u &numNodes, Ncv32u &numFeatures) +{ + NCVStatus ncvStat; + + std::string fext = filename.substr(filename.find_last_of(".") + 1); + std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower); + + if (fext == "nvbin") + { + FILE *fp; + fopen_s(&fp, filename.c_str(), "rb"); + ncvAssertReturn(fp != NULL, NCV_FILE_ERROR); + Ncv32u fileVersion; + fread_s(&fileVersion, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp); + ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR); + fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET); + Ncv32u tmp; + fread_s(&numStages, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp); + fread_s(&tmp, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp); + fread_s(&numNodes, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp); + fread_s(&numFeatures, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp); + fclose(fp); + } + else if (fext == "xml") + { + HaarClassifierCascadeDescriptor haar; + std::vector haarStages; + std::vector haarNodes; + std::vector haarFeatures; + + ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures); + ncvAssertReturnNcvStat(ncvStat); + + numStages = haar.NumStages; + numNodes = haar.NumClassifierTotalNodes; + numFeatures = haar.NumFeatures; + } + else + { + return NCV_HAAR_XML_LOADING_EXCEPTION; + } + + return NCV_SUCCESS; +} + + +NCVStatus ncvHaarLoadFromFile_host(const std::string &filename, + HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages, + NCVVector &h_HaarNodes, + NCVVector &h_HaarFeatures) +{ + ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned && + h_HaarNodes.memType() == NCVMemoryTypeHostPinned && + h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR); + + NCVStatus ncvStat; + + std::string fext = filename.substr(filename.find_last_of(".") + 1); + std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower); + + std::vector haarStages; + std::vector haarNodes; + std::vector haarFeatures; + + if (fext == "nvbin") + { + ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures); + ncvAssertReturnNcvStat(ncvStat); + } + else if (fext == "xml") + { + ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures); + ncvAssertReturnNcvStat(ncvStat); + } + else + { + return 
NCV_HAAR_XML_LOADING_EXCEPTION; + } + + ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY); + ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY); + ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY); + + memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64)); + memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128)); + memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64)); + + return NCV_SUCCESS; +} + + +NCVStatus ncvHaarStoreNVBIN_host(std::string &filename, + HaarClassifierCascadeDescriptor haar, + NCVVector &h_HaarStages, + NCVVector &h_HaarNodes, + NCVVector &h_HaarFeatures) +{ + ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT); + ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT); + ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT); + ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned && + h_HaarNodes.memType() == NCVMemoryTypeHostPinned && + h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR); + + Ncv32u szStages = haar.NumStages * sizeof(HaarStage64); + Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128); + Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64); + + Ncv32u dataOffset = 0; + std::vector fdata; + fdata.resize(szStages+szClassifiers+szFeatures+1024, 0); + + //header + *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION; + + //data + dataOffset = NVBIN_HAAR_SIZERESERVED; + *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages; + dataOffset += sizeof(Ncv32u); + *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes; + dataOffset += sizeof(Ncv32u); + *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes; + dataOffset += sizeof(Ncv32u); + *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures; + dataOffset += sizeof(Ncv32u); + *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize; + dataOffset += sizeof(NcvSize32u); + *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII; + dataOffset += sizeof(NcvBool); + *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly; + dataOffset += sizeof(NcvBool); + + memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages); + dataOffset += szStages; + memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers); + dataOffset += szClassifiers; + memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures); + dataOffset += szFeatures; + Ncv32u fsize = dataOffset; + + //TODO: CRC32 here + + //update header + dataOffset = sizeof(Ncv32u); + *(Ncv32u *)(&fdata[0]+dataOffset) = fsize; + + FILE *fp; + fopen_s(&fp, filename.c_str(), "wb"); + ncvAssertReturn(fp != NULL, NCV_FILE_ERROR); + fwrite(&fdata[0], fsize, 1, fp); + fclose(fp); + return NCV_SUCCESS; +} diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp new file mode 100644 index 0000000000..d9f500d7a0 --- /dev/null +++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp @@ -0,0 +1,501 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. 
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +//////////////////////////////////////////////////////////////////////////////// +// +// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework +// +// The algorithm and code are explained in the upcoming GPU Computing Gems +// chapter in detail: +// +// Anton Obukhov, "Haar Classifiers for Object Detection with CUDA" +// PDF URL placeholder +// email: aobukhov@nvidia.com, devsupport@nvidia.com +// +// Credits for help with the code to: +// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ncvhaarobjectdetection_hpp_ +#define _ncvhaarobjectdetection_hpp_ + +#include +#include "NCV.hpp" + + +//============================================================================== +// +// Guaranteed size cross-platform classifier structures +// +//============================================================================== + + +struct HaarFeature64 +{ + uint2 _ui2; + +#define HaarFeature64_CreateCheck_MaxRectField 0xFF + + __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight) + { + ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES); + ((NcvRect8u*)&(this->_ui2.x))->x = rectX; + ((NcvRect8u*)&(this->_ui2.x))->y = rectY; + ((NcvRect8u*)&(this->_ui2.x))->width = rectWidth; + ((NcvRect8u*)&(this->_ui2.x))->height = rectHeight; + return NCV_SUCCESS; + } + + __host__ NCVStatus setWeight(Ncv32f weight) + { + ((Ncv32f*)&(this->_ui2.y))[0] = weight; + return NCV_SUCCESS; + } + + __device__ __host__ void getRect(Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight) + { + NcvRect8u tmpRect = *(NcvRect8u*)(&this->_ui2.x); + *rectX = tmpRect.x; + *rectY = tmpRect.y; + *rectWidth = tmpRect.width; + *rectHeight = tmpRect.height; + } + + __device__ __host__ Ncv32f getWeight(void) + { + return *(Ncv32f*)(&this->_ui2.y); + } +}; + + +struct HaarFeatureDescriptor32 +{ +private: + +#define HaarFeatureDescriptor32_Interpret_MaskFlagTilted 0x80000000 +#define HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures 0x7F +#define HaarFeatureDescriptor32_NumFeatures_Shift 24 +#define HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset 0x00FFFFFF + + Ncv32u desc; + +public: + + __host__ NCVStatus create(NcvBool bTilted, Ncv32u numFeatures, Ncv32u offsetFeatures) + { + if (numFeatures > HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures) + { + return NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER; + } + if (offsetFeatures > HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset) + { + return NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE; + } + this->desc = 0; + this->desc |= (bTilted ? 
HaarFeatureDescriptor32_Interpret_MaskFlagTilted : 0); + this->desc |= (numFeatures << HaarFeatureDescriptor32_NumFeatures_Shift); + this->desc |= offsetFeatures; + return NCV_SUCCESS; + } + + __device__ __host__ NcvBool isTilted(void) + { + return (this->desc & HaarFeatureDescriptor32_Interpret_MaskFlagTilted) != 0; + } + + __device__ __host__ Ncv32u getNumFeatures(void) + { + return (this->desc & ~HaarFeatureDescriptor32_Interpret_MaskFlagTilted) >> HaarFeatureDescriptor32_NumFeatures_Shift; + } + + __device__ __host__ Ncv32u getFeaturesOffset(void) + { + return this->desc & HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset; + } +}; + + +struct HaarClassifierNodeDescriptor32 +{ + uint1 _ui1; + +#define HaarClassifierNodeDescriptor32_Interpret_MaskSwitch (1 << 30) + + __host__ NCVStatus create(Ncv32f leafValue) + { + if ((*(Ncv32u *)&leafValue) & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch) + { + return NCV_HAAR_XML_LOADING_EXCEPTION; + } + *(Ncv32f *)&this->_ui1 = leafValue; + return NCV_SUCCESS; + } + + __host__ NCVStatus create(Ncv32u offsetHaarClassifierNode) + { + if (offsetHaarClassifierNode >= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch) + { + return NCV_HAAR_XML_LOADING_EXCEPTION; + } + this->_ui1.x = offsetHaarClassifierNode; + this->_ui1.x |= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch; + return NCV_SUCCESS; + } + + __device__ __host__ NcvBool isLeaf(void) + { + return !(this->_ui1.x & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch); + } + + __host__ Ncv32f getLeafValueHost(void) + { + return *(Ncv32f *)&this->_ui1.x; + } + +#ifdef __CUDACC__ + __device__ Ncv32f getLeafValue(void) + { + return __int_as_float(this->_ui1.x); + } +#endif + + __device__ __host__ Ncv32u getNextNodeOffset(void) + { + return (this->_ui1.x & ~HaarClassifierNodeDescriptor32_Interpret_MaskSwitch); + } +}; + + +struct HaarClassifierNode128 +{ + uint4 _ui4; + + __host__ NCVStatus setFeatureDesc(HaarFeatureDescriptor32 f) + { + this->_ui4.x = *(Ncv32u *)&f; + return NCV_SUCCESS; + } + + __host__ NCVStatus setThreshold(Ncv32f t) + { + this->_ui4.y = *(Ncv32u *)&t; + return NCV_SUCCESS; + } + + __host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl) + { + this->_ui4.z = *(Ncv32u *)&nl; + return NCV_SUCCESS; + } + + __host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr) + { + this->_ui4.w = *(Ncv32u *)&nr; + return NCV_SUCCESS; + } + + __host__ __device__ HaarFeatureDescriptor32 getFeatureDesc(void) + { + return *(HaarFeatureDescriptor32 *)&this->_ui4.x; + } + + __host__ __device__ Ncv32f getThreshold(void) + { + return *(Ncv32f*)&this->_ui4.y; + } + + __host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void) + { + return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.z; + } + + __host__ __device__ HaarClassifierNodeDescriptor32 getRightNodeDesc(void) + { + return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.w; + } +}; + + +struct HaarStage64 +{ +#define HaarStage64_Interpret_MaskRootNodes 0x0000FFFF +#define HaarStage64_Interpret_MaskRootNodeOffset 0xFFFF0000 +#define HaarStage64_Interpret_ShiftRootNodeOffset 16 + + uint2 _ui2; + + __host__ NCVStatus setStageThreshold(Ncv32f t) + { + this->_ui2.x = *(Ncv32u *)&t; + return NCV_SUCCESS; + } + + __host__ NCVStatus setStartClassifierRootNodeOffset(Ncv32u val) + { + if (val > (HaarStage64_Interpret_MaskRootNodeOffset >> HaarStage64_Interpret_ShiftRootNodeOffset)) + { + return NCV_HAAR_XML_LOADING_EXCEPTION; + } + this->_ui2.y = (val << HaarStage64_Interpret_ShiftRootNodeOffset) | 
(this->_ui2.y & HaarStage64_Interpret_MaskRootNodes); + return NCV_SUCCESS; + } + + __host__ NCVStatus setNumClassifierRootNodes(Ncv32u val) + { + if (val > HaarStage64_Interpret_MaskRootNodes) + { + return NCV_HAAR_XML_LOADING_EXCEPTION; + } + this->_ui2.y = val | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodeOffset); + return NCV_SUCCESS; + } + + __host__ __device__ Ncv32f getStageThreshold(void) + { + return *(Ncv32f*)&this->_ui2.x; + } + + __host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void) + { + return (this->_ui2.y >> HaarStage64_Interpret_ShiftRootNodeOffset); + } + + __host__ __device__ Ncv32u getNumClassifierRootNodes(void) + { + return (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes); + } +}; + + +NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8); +NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4); +NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4); +NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16); +NPPST_CT_ASSERT(sizeof(HaarStage64) == 8); + + +//============================================================================== +// +// Classifier cascade descriptor +// +//============================================================================== + + +struct HaarClassifierCascadeDescriptor +{ + Ncv32u NumStages; + Ncv32u NumClassifierRootNodes; + Ncv32u NumClassifierTotalNodes; + Ncv32u NumFeatures; + NcvSize32u ClassifierSize; + NcvBool bNeedsTiltedII; + NcvBool bHasStumpsOnly; +}; + + +//============================================================================== +// +// Functional interface +// +//============================================================================== + + +enum +{ + NCVPipeObjDet_Default = 0x000, + NCVPipeObjDet_UseFairImageScaling = 0x001, + NCVPipeObjDet_FindLargestObject = 0x002, + NCVPipeObjDet_VisualizeInPlace = 0x004, +}; + + +NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, + NcvSize32u srcRoi, + NCVVector &d_dstRects, + Ncv32u &dstNumRects, + + HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages, + NCVVector &d_HaarStages, + NCVVector &d_HaarNodes, + NCVVector &d_HaarFeatures, + + NcvSize32u minObjSize, + Ncv32u minNeighbors, //default 4 + Ncv32f scaleStep, //default 1.2f + Ncv32u pixelStep, //default 1 + Ncv32u flags, //default NCVPipeObjDet_Default + + INCVMemAllocator &gpuAllocator, + INCVMemAllocator &cpuAllocator, + Ncv32u devPropMajor, + Ncv32u devPropMinor, + cudaStream_t cuStream); + + +#define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF +#define HAAR_STDDEV_BORDER 1 + + +NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImage, + NCVMatrix &d_weights, + NCVMatrixAlloc &d_pixelMask, + Ncv32u &numDetections, + HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages, + NCVVector &d_HaarStages, + NCVVector &d_HaarNodes, + NCVVector &d_HaarFeatures, + NcvBool bMaskElements, + NcvSize32u anchorsRoi, + Ncv32u pixelStep, + Ncv32f scaleArea, + INCVMemAllocator &gpuAllocator, + INCVMemAllocator &cpuAllocator, + Ncv32u devPropMajor, + Ncv32u devPropMinor, + cudaStream_t cuStream); + + +NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix &h_integralImage, + NCVMatrix &h_weights, + NCVMatrixAlloc &h_pixelMask, + Ncv32u &numDetections, + HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages, + NCVVector &h_HaarNodes, + NCVVector &h_HaarFeatures, + NcvBool bMaskElements, + NcvSize32u anchorsRoi, + Ncv32u pixelStep, + Ncv32f scaleArea); + + +NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, + Ncv32u dstStride, + Ncv32u dstWidth, + Ncv32u dstHeight, + 
NcvRect32u *d_rects, + Ncv32u numRects, + Ncv8u color, + cudaStream_t cuStream); + + +NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, + Ncv32u dstStride, + Ncv32u dstWidth, + Ncv32u dstHeight, + NcvRect32u *d_rects, + Ncv32u numRects, + Ncv32u color, + cudaStream_t cuStream); + + +NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, + Ncv32u dstStride, + Ncv32u dstWidth, + Ncv32u dstHeight, + NcvRect32u *h_rects, + Ncv32u numRects, + Ncv8u color); + + +NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, + Ncv32u dstStride, + Ncv32u dstWidth, + Ncv32u dstHeight, + NcvRect32u *h_rects, + Ncv32u numRects, + Ncv32u color); + + +#define RECT_SIMILARITY_PROPORTION 0.2f + + +NCVStatus ncvGrowDetectionsVector_device(NCVVector &pixelMask, + Ncv32u numPixelMaskDetections, + NCVVector &hypotheses, + Ncv32u &totalDetections, + Ncv32u totalMaxDetections, + Ncv32u rectWidth, + Ncv32u rectHeight, + Ncv32f curScale, + cudaStream_t cuStream); + + +NCVStatus ncvGrowDetectionsVector_host(NCVVector &pixelMask, + Ncv32u numPixelMaskDetections, + NCVVector &hypotheses, + Ncv32u &totalDetections, + Ncv32u totalMaxDetections, + Ncv32u rectWidth, + Ncv32u rectHeight, + Ncv32f curScale); + + +NCVStatus ncvFilterHypotheses_host(NCVVector &hypotheses, + Ncv32u &numHypotheses, + Ncv32u minNeighbors, + Ncv32f intersectEps, + NCVVector *hypothesesWeights); + + +NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages, + Ncv32u &numNodes, Ncv32u &numFeatures); + + +NCVStatus ncvHaarLoadFromFile_host(const std::string &filename, + HaarClassifierCascadeDescriptor &haar, + NCVVector &h_HaarStages, + NCVVector &h_HaarNodes, + NCVVector &h_HaarFeatures); + + +NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename, + HaarClassifierCascadeDescriptor haar, + NCVVector &h_HaarStages, + NCVVector &h_HaarNodes, + NCVVector &h_HaarFeatures); + + + +#endif // _ncvhaarobjectdetection_hpp_ diff --git a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp b/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp new file mode 100644 index 0000000000..14d16bb3b9 --- /dev/null +++ b/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp @@ -0,0 +1,174 @@ +//////////////////////////////////////////////////////////////////////////////// +// The Loki Library +// Copyright (c) 2001 by Andrei Alexandrescu +// This code accompanies the book: +// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design +// Patterns Applied". Copyright (c) 2001. Addison-Wesley. +// Permission to use, copy, modify, distribute and sell this software for any +// purpose is hereby granted without fee, provided that the above copyright +// notice appear in all copies and that both that copyright notice and this +// permission notice appear in supporting documentation. +// The author or Addison-Welsey Longman make no representations about the +// suitability of this software for any purpose. It is provided "as is" +// without express or implied warranty. 
+// http://loki-lib.sourceforge.net/index.php?n=Main.License +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ncvruntimetemplates_hpp_ +#define _ncvruntimetemplates_hpp_ + +#include +#include + + +namespace Loki +{ + //============================================================================== + // class NullType + // Used as a placeholder for "no type here" + // Useful as an end marker in typelists + //============================================================================== + + class NullType {}; + + //============================================================================== + // class template Typelist + // The building block of typelists of any length + // Use it through the LOKI_TYPELIST_NN macros + // Defines nested types: + // Head (first element, a non-typelist type by convention) + // Tail (second element, can be another typelist) + //============================================================================== + + template + struct Typelist + { + typedef T Head; + typedef U Tail; + }; + + //============================================================================== + // class template Int2Type + // Converts each integral constant into a unique type + // Invocation: Int2Type where v is a compile-time constant integral + // Defines 'value', an enum that evaluates to v + //============================================================================== + + template + struct Int2Type + { + enum { value = v }; + }; + + namespace TL + { + //============================================================================== + // class template TypeAt + // Finds the type at a given index in a typelist + // Invocation (TList is a typelist and index is a compile-time integral + // constant): + // TypeAt::Result + // returns the type in position 'index' in TList + // If you pass an out-of-bounds index, the result is a compile-time error + //============================================================================== + + template struct TypeAt; + + template + struct TypeAt, 0> + { + typedef Head Result; + }; + + template + struct TypeAt, i> + { + typedef typename TypeAt::Result Result; + }; + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Runtime boolean template instance dispatcher +// Cyril Crassin +// NVIDIA, 2010 +//////////////////////////////////////////////////////////////////////////////// + +namespace NCVRuntimeTemplateBool +{ + //This struct is used to transform a list of parameters into template arguments + //The idea is to build a typelist containing the arguments + //and to pass this typelist to a user defined functor + template + struct KernelCaller + { + //Convenience function used by the user + //Takes a variable argument list, transforms it into a list + static void call(Func &functor, int dummy, ...) + { + //Vector used to collect arguments + std::vector templateParamList; + + //Variable argument list manipulation + va_list listPointer; + va_start(listPointer, dummy); + //Collect parameters into the list + for(int i=0; i &templateParamList) + { + //Get current parameter value in the list + int val = templateParamList[templateParamList.size() - 1]; + templateParamList.pop_back(); + + //Select the compile time value to add into the typelist + //depending on the runtime variable and make recursive call. 
+ //Both versions are really instantiated + if(val) + { + KernelCaller< + Loki::Typelist, TList >, + NumArguments-1, Func > + ::call(functor, templateParamList); + } + else + { + KernelCaller< + Loki::Typelist, TList >, + NumArguments-1, Func > + ::call(functor, templateParamList); + } + } + }; + + //Specialization for 0 value left in the list + //-> actual kernel functor call + template + struct KernelCaller + { + static void call(Func &functor) + { + //Call to the functor's kernel call method + functor.call(TList()); //TList instantiated to get the method template parameter resolved + } + + static void call(Func &functor, std::vector &templateParams) + { + functor.call(TList()); + } + }; +} + +#endif //_ncvruntimetemplates_hpp_ diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index eb906e3368..a7ba6ffaba 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -71,6 +71,9 @@ #include "npp_staging.h" #include "surf_key_point.h" + #include "nvidia/NCV.hpp" + #include "nvidia/NCVHaarObjectDetection.hpp" + #define CUDART_MINIMUM_REQUIRED_VERSION 3020 #define NPP_MINIMUM_REQUIRED_VERSION 3216 diff --git a/samples/gpu/cascadeclassifier.cpp b/samples/gpu/cascadeclassifier.cpp new file mode 100644 index 0000000000..7f0a0e74db --- /dev/null +++ b/samples/gpu/cascadeclassifier.cpp @@ -0,0 +1,193 @@ +// WARNING: this sample is under construction! Use it on your own risk. + +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; +using namespace cv; +using namespace cv::gpu; + +void help() +{ + cout << "Usage: ./cascadeclassifier \n" + "Using OpenCV version " << CV_VERSION << endl << endl; +} + +void DetectAndDraw(Mat& img, CascadeClassifier_GPU& cascade); + +String cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml"; +String nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml"; + + + +template void convertAndReseize(const T& src, T& gray, T& resized, double scale = 2.0) +{ + if (src.channels() == 3) + cvtColor( src, gray, CV_BGR2GRAY ); + else + gray = src; + + Size sz(cvRound(gray.cols * scale), cvRound(gray.rows * scale)); + if (scale != 1) + resize(gray, resized, sz); + else + resized = gray; +} + + + +int main( int argc, const char** argv ) +{ + if (argc != 3) + return help(), -1; + + if (cv::gpu::getCudaEnabledDeviceCount() == 0) + return cerr << "No GPU found or the library is compiled without GPU support" << endl, -1; + + VideoCapture capture; + + string cascadeName = argv[1]; + string inputName = argv[2]; + + cv::gpu::CascadeClassifier_GPU cascade_gpu; + if( !cascade_gpu.load( cascadeName ) ) + return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1; + + cv::CascadeClassifier cascade_cpu; + if( !cascade_cpu.load( cascadeName ) ) + return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1; + + Mat image = imread( inputName); + if( image.empty() ) + if (!capture.open(inputName)) + { + int camid = 0; + sscanf(inputName.c_str(), "%d", &camid); + if(!capture.open(camid)) + cout << "Can't open source" << endl; + } + + namedWindow( "result", 1 ); + Size fontSz = cv::getTextSize("T[]", FONT_HERSHEY_SIMPLEX, 1.0, 2, 0); + + Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp; + vector facesBuf_cpu; + + GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu; + + /* parameters */ + bool useGPU = true; + double scale_factor = 2; + + bool 
visualizeInPlace = false; + bool findLargestObject = false; + + printf("\t - toggle GPU/CPU\n"); + printf("\tL - toggle lagest faces\n"); + printf("\tV - toggle visualisation in-place (for GPU only)\n"); + printf("\t1/q - inc/dec scale\n"); + + int detections_num; + for(;;) + { + if( capture.isOpened() ) + { + capture >> frame; + if( frame.empty()) + break; + } + + (image.empty() ? frame : image).copyTo(frame_cpu); + frame_gpu.upload( image.empty() ? frame : image); + + convertAndReseize(frame_gpu, gray_gpu, resized_gpu, scale_factor); + convertAndReseize(frame_cpu, gray_cpu, resized_cpu, scale_factor); + + cv::TickMeter tm; + tm.start(); + + if (useGPU) + { + cascade_gpu.visualizeInPlace = visualizeInPlace; + cascade_gpu.findLargestObject = findLargestObject; + + detections_num = cascade_gpu.detectMultiScale( resized_gpu, facesBuf_gpu ); + facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded); + + } + else /* so use CPU */ + { + Size minSize = cascade_gpu.getClassifierSize(); + if (findLargestObject) + { + float ratio = (float)std::min(frame.cols / minSize.width, frame.rows / minSize.height); + ratio = std::max(ratio / 2.5f, 1.f); + minSize = Size(cvRound(minSize.width * ratio), cvRound(minSize.height * ratio)); + } + + cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2, 4, (findLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE, minSize); + detections_num = (int)facesBuf_cpu.size(); + } + + tm.stop(); + printf( "detection time = %g ms\n", tm.getTimeMilli() ); + + if (useGPU) + resized_gpu.download(resized_cpu); + + if (!visualizeInPlace || !useGPU) + if (detections_num) + { + Rect* faces = useGPU ? faces_downloaded.ptr() : &facesBuf_cpu[0]; + for(int i = 0; i < detections_num; ++i) + cv::rectangle(resized_cpu, faces[i], Scalar(255)); + } + + Point text_pos(5, 25); + int offs = fontSz.height + 5; + Scalar color = CV_RGB(255, 0, 0); + + + cv::cvtColor(resized_cpu, frameDisp, CV_GRAY2BGR); + + char buf[4096]; + sprintf(buf, "%s, FPS = %0.3g", useGPU ? "GPU" : "CPU", 1.0/tm.getTimeSec()); + putText(frameDisp, buf, text_pos, FONT_HERSHEY_SIMPLEX, 1.0, color, 2); + sprintf(buf, "scale = %0.3g, [%d*scale x %d*scale]", scale_factor, frame.cols, frame.rows); + putText(frameDisp, buf, text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2); + putText(frameDisp, "Hotkeys: space, 1, Q, L, V, Esc", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2); + + if (findLargestObject) + putText(frameDisp, "FindLargestObject", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2); + + if (visualizeInPlace && useGPU) + putText(frameDisp, "VisualizeInPlace", text_pos+Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2); + + cv::imshow( "result", frameDisp); + + int key = waitKey( 5 ); + if( key == 27) + break; + + switch (key) + { + case (int)' ': useGPU = !useGPU; printf("Using %s\n", useGPU ? "GPU" : "CPU");break; + case (int)'v': case (int)'V': visualizeInPlace = !visualizeInPlace; printf("VisualizeInPlace = %d\n", visualizeInPlace); break; + case (int)'l': case (int)'L': findLargestObject = !findLargestObject; printf("FindLargestObject = %d\n", findLargestObject); break; + case (int)'1': scale_factor*=1.05; printf("Scale factor = %g\n", scale_factor); break; + case (int)'q': case (int)'Q':scale_factor/=1.05; printf("Scale factor = %g\n", scale_factor); break; + } + + } + return 0; +} + + +
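For reference, a minimal still-image sketch of the CascadeClassifier_GPU interface exercised by the interactive sample above (load a cascade, run detectMultiScale on a GpuMat, download the detections). This is not part of the patch; it assumes an 8-bit grayscale input, OpenCV built with the GPU module, and placeholder file names.

#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Placeholder paths: substitute a real cascade file and test image.
    cv::gpu::CascadeClassifier_GPU cascade;
    if (!cascade.load("haarcascade_frontalface_alt.xml"))
        return std::cerr << "ERROR: could not load cascade" << std::endl, -1;

    cv::Mat img = cv::imread("input.png", 0); // 8-bit grayscale, as required by the GPU pipeline
    if (img.empty())
        return std::cerr << "ERROR: could not load image" << std::endl, -1;

    cv::gpu::GpuMat d_img(img), d_faces;

    // detectMultiScale returns the number of detections written into d_faces.
    int detections_num = cascade.detectMultiScale(d_img, d_faces);

    cv::Mat faces_host;
    if (detections_num > 0)
        d_faces.colRange(0, detections_num).download(faces_host);

    // Draw the detected rectangles on the host image, as the sample does.
    const cv::Rect* faces = faces_host.ptr<cv::Rect>();
    for (int i = 0; i < detections_num; ++i)
        cv::rectangle(img, faces[i], cv::Scalar(255));

    cv::imwrite("result.png", img);
    return 0;
}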