diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 4c3af5956c..fd72d9e9d7 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1435,7 +1435,7 @@ public: bool load(const std::string& filename); void release(); - int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4, + int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4, cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/); void preallocateIntegralBuffer(cv::Size desired); Size getClassifierSize() const; diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index e1990488a3..a9b3d7e547 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -69,16 +69,14 @@ GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo) cv::gpu::GpuMat img(img_host); - cv::gpu::GpuMat gpu_rects, buffer; + cv::gpu::GpuMat gpu_rects; cv::gpu::CascadeClassifier_GPU_LBP cascade(img.size()); ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml"))); - // cascade.detectMultiScale(img, objects_buffer); - cascade.detectMultiScale(img, buffer, gpu_rects); - + cascade.detectMultiScale(img, gpu_rects); TEST_CYCLE() { - cascade.detectMultiScale(img, buffer, gpu_rects); + cascade.detectMultiScale(img, gpu_rects); } } diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index e8522568e3..5422dcf99d 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -70,7 +70,7 @@ Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const void cv::gpu::CascadeClassifier_GPU_LBP::preallocateIntegralBuffer(cv::Size /*desired*/) { throw_nogpu();} void cv::gpu::CascadeClassifier_GPU_LBP::initializeBuffers(cv::Size /*frame*/) { throw_nogpu();} -int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*scaledImageBuffer*/, cv::gpu::GpuMat& /*objectsBuf*/, +int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/, double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;} #else @@ -299,28 +299,29 @@ namespace cv { namespace gpu { namespace device { namespace lbp { - void classifyStump(const DevMem2Db mstages, - const int nstages, - const DevMem2Di mnodes, - const DevMem2Df mleaves, - const DevMem2Di msubsets, - const DevMem2Db mfeatures, - const DevMem2Di integral, - const int workWidth, - const int workHeight, - const int clWidth, - const int clHeight, - float scale, - int step, - int subsetSize, - DevMem2D_ objects, - unsigned int* classified); - - int connectedConmonents(DevMem2D_ candidates, DevMem2D_ objects,int groupThreshold, float grouping_eps, unsigned int* nclasses); + void classifyStump(const DevMem2Db& mstages, + const int nstages, + const DevMem2Di& mnodes, + const DevMem2Df& mleaves, + const DevMem2Di& msubsets, + const DevMem2Db& mfeatures, + const int workWidth, + const int workHeight, + const int clWidth, + const int clHeight, + float scale, + int step, + int subsetSize, + DevMem2D_ objects, + unsigned int* classified); + + int connectedConmonents(DevMem2D_ candidates, DevMem2D_ objects,int groupThreshold, float grouping_eps, unsigned int* nclasses); + void bindIntegral(DevMem2Di integral); + void unbindIntegral(); } }}} -int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects, +int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/) { CV_Assert( scaleFactor > 1 && image.depth() == CV_8U ); @@ -332,10 +333,12 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp if( !objects.empty() && objects.depth() == CV_32S) objects.reshape(4, 1); else - objects.create(1 , defaultObjSearchNum, CV_32SC4); - - GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4); - // GpuMat candidates(objects); + objects.create(1 , image.cols >> 4, CV_32SC4); + GpuMat candidates(1 , image.cols >> 1, CV_32SC4); + // GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4); + // used for debug + // candidates.setTo(cv::Scalar::all(0)); + // objects.setTo(cv::Scalar::all(0)); if (maxObjectSize == cv::Size()) maxObjectSize = image.size(); @@ -347,9 +350,11 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp cudaMalloc(&dclassified, sizeof(int)); cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice); int step; + cv::gpu::device::lbp::bindIntegral(integral); for( double factor = 1; ; factor *= scaleFactor ) { + // if (factor > 2.0) break; cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor)); cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor )); cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); @@ -365,7 +370,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height)); GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1)); - GpuMat currBuff = integralBuffer;//(integralBuffer, cv::Rect(0, 0, integralBuffer.width, integralBuffer.height)); + GpuMat currBuff = integralBuffer; cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR); cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff); @@ -373,8 +378,10 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp step = (factor <= 2.) + 1; cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, - scaledIntegral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified); + processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified); } + + cv::gpu::device::lbp::unbindIntegral(); if (groupThreshold <= 0 || objects.empty()) return 0; cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified); diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index 769430ed8d..ba2e29448f 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -48,8 +48,102 @@ namespace cv { namespace gpu { namespace device { namespace lbp { + + texture tintegral(false, cudaFilterModePoint, cudaAddressModeClamp); + + struct LBP + { + __device__ __forceinline__ LBP(const LBP& other) {(void)other;} + __device__ __forceinline__ LBP() {} + + //feature as uchar x, y - left top, z,w - right bottom + __device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const + { + int anchors[9]; + + anchors[0] = tex2D(tintegral, tx, ty); + anchors[1] = tex2D(tintegral, tx + featurez, ty); + anchors[0] -= anchors[1]; + anchors[2] = tex2D(tintegral, tx + featurez * 2, ty); + anchors[1] -= anchors[2]; + anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty); + + ty += fh; + anchors[3] = tex2D(tintegral, tx, ty); + anchors[4] = tex2D(tintegral, tx + featurez, ty); + anchors[3] -= anchors[4]; + anchors[5] = tex2D(tintegral, tx + featurez * 2, ty); + anchors[4] -= anchors[5]; + anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty); + + anchors[0] -= anchors[3]; + anchors[1] -= anchors[4]; + anchors[2] -= anchors[5]; + // 0 - 2 contains s0 - s2 + + ty += fh; + anchors[6] = tex2D(tintegral, tx, ty); + anchors[7] = tex2D(tintegral, tx + featurez, ty); + anchors[6] -= anchors[7]; + anchors[8] = tex2D(tintegral, tx + featurez * 2, ty); + anchors[7] -= anchors[8]; + anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty); + + anchors[3] -= anchors[6]; + anchors[4] -= anchors[7]; + anchors[5] -= anchors[8]; + // 3 - 5 contains s3 - s5 + + anchors[0] -= anchors[4]; + anchors[1] -= anchors[4]; + anchors[2] -= anchors[4]; + anchors[3] -= anchors[4]; + anchors[5] -= anchors[4]; + + int response = (~(anchors[0] >> 31)) & 4; + response |= (~(anchors[1] >> 31)) & 2;; + response |= (~(anchors[2] >> 31)) & 1; + + shift = (~(anchors[5] >> 31)) & 16; + shift |= (~(anchors[3] >> 31)) & 1; + + ty += fh; + anchors[0] = tex2D(tintegral, tx, ty); + anchors[1] = tex2D(tintegral, tx + featurez, ty); + anchors[0] -= anchors[1]; + anchors[2] = tex2D(tintegral, tx + featurez * 2, ty); + anchors[1] -= anchors[2]; + anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty); + + anchors[6] -= anchors[0]; + anchors[7] -= anchors[1]; + anchors[8] -= anchors[2]; + // 0 -2 contains s6 - s8 + + anchors[6] -= anchors[4]; + anchors[7] -= anchors[4]; + anchors[8] -= anchors[4]; + + shift |= (~(anchors[6] >> 31)) & 2; + shift |= (~(anchors[7] >> 31)) & 4; + shift |= (~(anchors[8] >> 31)) & 8; + return response; + } + }; + + void bindIntegral(DevMem2Di integral) + { + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, &tintegral, integral.ptr(), &desc, (size_t)integral.cols, (size_t)integral.rows, (size_t)integral.step)); + } + + void unbindIntegral() + { + cudaSafeCall( cudaUnbindTexture(&tintegral)); + } + __global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features, - const int* integral, const int istep, const int workWidth,const int workHeight, const int clWidth, const int clHeight, const float scale, const int step, + /* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step, const int subsetSize, DevMem2D_ objects, unsigned int* n) { int x = threadIdx.x * step; @@ -63,21 +157,18 @@ namespace cv { namespace gpu { namespace device { float sum = 0; Stage stage = stages[s]; - for (int t = 0; t < stage.ntrees; t++) { ClNode node = nodes[current_node]; uchar4 feature = features[node.featureIdx]; - int c = evaluator( (y + feature.y) * istep + x + feature.x , feature.w * istep, feature.z, integral, istep); - const int* subsetIdx = subsets + (current_node * subsetSize); - - int idx = (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1; + int shift; + int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift); + int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1; sum += leaves[idx]; current_node += 1; current_leave += 2; } - if (sum < stage.threshold) return; } @@ -85,8 +176,8 @@ namespace cv { namespace gpu { namespace device int4 rect; rect.x = roundf(x * scale); rect.y = roundf(y * scale); - rect.z = roundf(clWidth); - rect.w = roundf(clHeight); + rect.z = clWidth; + rect.w = clHeight; #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) int res = __atomicInc(n, 100U); #else @@ -178,8 +269,8 @@ namespace cv { namespace gpu { namespace device } } - void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures, - const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, + void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, + /*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified) { int blocks = ceilf(workHeight / (float)step); @@ -190,11 +281,8 @@ namespace cv { namespace gpu { namespace device const float* leaves = mleaves.ptr(); const int* subsets = msubsets.ptr(); const uchar4* features = (uchar4*)(mfeatures.ptr()); - const int* integ = integral.ptr(); - int istep = integral.step / sizeof(int); - - lbp_classify_stump<<>>(stages, nstages, nodes, leaves, subsets, features, integ, istep, - workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified); + lbp_classify_stump<<>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep, + workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified); } int connectedConmonents(DevMem2D_ candidates, DevMem2D_ objects, int groupThreshold, float grouping_eps, unsigned int* nclasses) diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp index b5ef365846..69867c9939 100644 --- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp +++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp @@ -153,90 +153,8 @@ __device__ __forceinline__ T __atomicMin(T* address, T val) __syncthreads(); // printf("tid %d label %d\n", tid, labels[tid]); } - - struct LBP - { - __device__ __forceinline__ LBP(const LBP& other) {(void)other;} - __device__ __forceinline__ LBP() {} - - //feature as uchar x, y - left top, z,w - right bottom - __device__ __forceinline__ int operator() (unsigned int y, int featurew, int featurez, const int* integral, int step) const - { - int x_off = 2 * featurez; - int anchors[9]; - - anchors[0] = integral[y]; - anchors[1] = integral[y + featurez]; - anchors[0] -= anchors[1]; - anchors[2] = integral[y + x_off]; - anchors[1] -= anchors[2]; - anchors[2] -= integral[y + featurez + x_off]; - y += featurew; - - anchors[3] = integral[y]; - anchors[4] = integral[y + featurez]; - anchors[3] -= anchors[4]; - anchors[5] = integral[y + x_off]; - anchors[4] -= anchors[5]; - anchors[5] -= integral[y + featurez + x_off]; - - anchors[0] -= anchors[3]; - anchors[1] -= anchors[4]; - anchors[2] -= anchors[5]; - // 0 - 2 contains s0 - s2 - - y += featurew; - anchors[6] = integral[y]; - anchors[7] = integral[y + featurez]; - anchors[6] -= anchors[7]; - anchors[8] = integral[y + x_off]; - anchors[7] -= anchors[8]; - anchors[8] -= integral[y + x_off + featurez]; - - anchors[3] -= anchors[6]; - anchors[4] -= anchors[7]; - anchors[5] -= anchors[8]; - // 3 - 5 contains s3 - s5 - - anchors[0] -= anchors[4]; - anchors[1] -= anchors[4]; - anchors[2] -= anchors[4]; - anchors[3] -= anchors[4]; - anchors[5] -= anchors[4]; - - int response = (~(anchors[0] >> 31)) & 128; - response |= (~(anchors[1] >> 31)) & 64;; - response |= (~(anchors[2] >> 31)) & 32; - response |= (~(anchors[5] >> 31)) & 16; - response |= (~(anchors[3] >> 31)) & 1; - - y += featurew; - anchors[0] = integral[y]; - anchors[1] = integral[y + featurez]; - anchors[0] -= anchors[1]; - anchors[2] = integral[y + x_off]; - anchors[1] -= anchors[2]; - anchors[2] -= integral[y + x_off + featurez]; - - anchors[6] -= anchors[0]; - anchors[7] -= anchors[1]; - anchors[8] -= anchors[2]; - // 0 -2 contains s6 - s8 - - anchors[6] -= anchors[4]; - anchors[7] -= anchors[4]; - anchors[8] -= anchors[4]; - - response |= (~(anchors[6] >> 31)) & 2; - response |= (~(anchors[7] >> 31)) & 4; - response |= (~(anchors[8] >> 31)) & 8; - - return response; - } - }; } // lbp - } } }// namespaces #endif \ No newline at end of file diff --git a/modules/gpu/test/test_objdetect.cpp b/modules/gpu/test/test_objdetect.cpp index 8b49538d40..fdd9454c86 100644 --- a/modules/gpu/test/test_objdetect.cpp +++ b/modules/gpu/test/test_objdetect.cpp @@ -343,15 +343,16 @@ TEST_P(LBP_classify, Accuracy) cv::gpu::CascadeClassifier_GPU_LBP gpuClassifier; ASSERT_TRUE(gpuClassifier.load(classifierXmlPath)); - cv::gpu::GpuMat gpu_rects, buffer; + cv::gpu::GpuMat gpu_rects; cv::gpu::GpuMat tested(grey); - int count = gpuClassifier.detectMultiScale(tested, buffer, gpu_rects); + int count = gpuClassifier.detectMultiScale(tested, gpu_rects); cv::Mat gpu_f(gpu_rects); int* gpu_faces = (int*)gpu_f.ptr(); for (int i = 0; i < count; i++) { cv::Rect r(gpu_faces[i * 4],gpu_faces[i * 4 + 1],gpu_faces[i * 4 + 2],gpu_faces[i * 4 + 3]); + std::cout << gpu_faces[i * 4]<< " " << gpu_faces[i * 4 + 1] << " " << gpu_faces[i * 4 + 2] << " " << gpu_faces[i * 4 + 3] << std::endl; cv::rectangle(markedImage, r , cv::Scalar(0, 0, 255, 255)); } }