diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 38da35b4f1..8d5ae12d87 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1435,7 +1435,8 @@ public: bool load(const std::string& filename); void release(); - int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4/*, Size minSize = Size()*/); + int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4, + cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/); void preallocateIntegralBuffer(cv::Size desired); bool findLargestObject; diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 65ff9f4b6e..e1a4554ff1 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -48,20 +48,6 @@ using namespace cv; using namespace cv::gpu; using namespace std; -struct Stage -{ - int first; - int ntrees; - float threshold; -}; - -struct DTreeNode -{ - int featureIdx; - int left; - int right; -}; - #if !defined (HAVE_CUDA) // ============ old fashioned haar cascade ==============================================// cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); } @@ -128,6 +114,13 @@ bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string& classifierAsXml) #define GPU_CC_FEATURES "features" #define GPU_CC_RECT "rect" +struct Stage +{ + int first; + int ntrees; + float threshold; +}; + // currently only stump based boost classifiers are supported bool CascadeClassifier_GPU_LBP::read(const FileNode &root) { @@ -279,12 +272,26 @@ namespace cv { namespace gpu { namespace device { namespace lbp { - void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db features, - const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, int minNeighbors = 4, cudaStream_t stream = 0); + void classifyStump(const DevMem2Db mstages, + const int nstages, + const DevMem2Di mnodes, + const DevMem2Df mleaves, + const DevMem2Di msubsets, + const DevMem2Db mfeatures, + const DevMem2Di integral, + const int workWidth, + const int workHeight, + const int clWidth, + const int clHeight, + float scale, + int step, + int subsetSize, + DevMem2D_ objects); } }}} -int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects, double scaleFactor, int minNeighbors /*, Size minSize=Size()*/) +int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects, + double scaleFactor, int minNeighbors, cv::Size maxObjectSize /*, Size minSize=Size()*/) { CV_Assert( scaleFactor > 1 && image.depth() == CV_8U ); CV_Assert(!empty()); @@ -299,28 +306,35 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp // temp solution objects.create(image.rows, image.cols, CV_32SC4); - scaledImageBuffer.create(image.size(), image.type()); + if (maxObjectSize == cv::Size()) + maxObjectSize = image.size(); + + scaledImageBuffer.create(image.rows + 1, image.cols + 1, CV_8U); - // TODO: specify max objects size for( double factor = 1; ; factor *= scaleFactor ) { cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor)); cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor )); cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); - // nothing to do if (processingRectSize.width <= 0 || processingRectSize.height <= 0 ) break; - // TODO: min max object sizes cheching - cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, INTER_NEAREST); - //prepare image for evaluation + + if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height ) + break; + + // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height ) + // continue; + + cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, CV_INTER_LINEAR); + integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32SC1); cv::gpu::integral(scaledImageBuffer, integral); int step = (factor <= 2.) + 1; - cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat, features_mat, - integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, minNeighbors); + cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, + integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects); } // TODO: reject levels diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index 888bf3b81f..7133920452 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -46,54 +46,69 @@ namespace cv { namespace gpu { namespace device { namespace lbp { - __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2D_< ::cv::gpu::device::ClNode> nodes, - const DevMem2Df leaves, const DevMem2Di subsets, - const DevMem2D_ features, const DevMem2Di integral, float step, int subsetSize, DevMem2D_ objects, float scale, int clWidth, int clHeight) + __global__ void lbp_classify_stump(Stage* stages, int nstages, ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features, + const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects) { - unsigned int x = threadIdx.x * step; - unsigned int y = blockIdx.x * step; - int nodeOfs = 0, leafOfs = 0; - ::cv::gpu::device::Feature evaluator; + int y = threadIdx.x * scale; + int x = blockIdx.x * scale; - for (int s = 0; s < stages.cols; s++ ) + int i = 0; + + int current_node = 0; + int current_leave = 0; + + LBP evaluator; + for (int s = 0; s < nstages; s++ ) { - ::cv::gpu::device::Stage stage = stages(0, s); - int sum = 0; - for (int w = 0; w < stage.ntrees; w++) + float sum = 0; + Stage stage = stages[s]; + + for (int t = 0; t < stage.ntrees; t++) { - ::cv::gpu::device::ClNode node = nodes(0, nodeOfs); - uchar4 feature = features(0, node.featureIdx); + ClNode node = nodes[current_node]; + + uchar4 feature = features[node.featureIdx]; + int c = evaluator(y, x, feature, integral); + const int* subsetIdx = subsets + (current_node * subsetSize); - uchar c = evaluator(y, x, feature, integral); - const int subsetIdx = (nodeOfs * subsetSize); - int idx = subsetIdx + ((c >> 5) & ( 1 << (c & 31)) ? leafOfs : leafOfs + 1); - sum += leaves(0, subsets(0, idx) ); - nodeOfs++; - leafOfs += 2; + int idx = (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1; + sum += leaves[idx]; + current_node += 1; + current_leave += 2; } + i = s; if (sum < stage.threshold) return; } + int4 rect; rect.x = roundf(x * scale); rect.y = roundf(y * scale); - rect.z = roundf(clWidth * scale); - rect.w = roundf(clHeight * scale); - objects(blockIdx.x, threadIdx.x) = rect; + rect.z = roundf(clWidth); + rect.w = roundf(clHeight); + + if(i >= 19) + printf( "GPU detected [%d, %d] - [%d, %d]\n", rect.x, rect.y, rect.z, rect.w); + } - void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db bnodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db bfeatures, - const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, int minNeighbors, cudaStream_t stream) + void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures, + const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, + DevMem2D_ objects) { - printf("CascadeClassify"); - int blocks = ceilf(workHeight / (float)step); + int blocks = ceilf(workHeight / (float)step); int threads = ceilf(workWidth / (float)step); - DevMem2D_< ::cv::gpu::device::Stage> stages = DevMem2D_< ::cv::gpu::device::Stage>(bstages); - DevMem2D_ features = (DevMem2D_)bfeatures; - DevMem2D_< ::cv::gpu::device::ClNode> nodes = DevMem2D_< ::cv::gpu::device::ClNode>(bnodes); + printf("blocks %d, threads %d\n", blocks, threads); + + Stage* stages = (Stage*)(mstages.ptr()); + ClNode* nodes = (ClNode*)(mnodes.ptr()); + const float* leaves = mleaves.ptr(); + const int* subsets = msubsets.ptr(); + const uchar4* features = (uchar4*)(mfeatures.ptr()); - lbp_classify<<>>(stages, trees, nodes, leaves, subsets, features, integral, step, subsetSize, objects, scale, clWidth, clHeight); + lbp_classify_stump<<>>(stages, nstages, nodes, leaves, subsets, features, integral, + workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects); } } }}} \ No newline at end of file diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp index debda53d40..3296ee3967 100644 --- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp +++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp @@ -44,62 +44,58 @@ #define __OPENCV_GPU_DEVICE_LBP_HPP_ #include "internal_shared.hpp" -// #include "opencv2/gpu/device/border_interpolate.hpp" -// #include "opencv2/gpu/device/vec_traits.hpp" -// #include "opencv2/gpu/device/vec_math.hpp" -// #include "opencv2/gpu/device/saturate_cast.hpp" -// #include "opencv2/gpu/device/filters.hpp" - -// #define CALC_SUM_(p0, p1, p2, p3, offset) \ -// ((p0)[offset] - (p1)[offset] - (p2)[offset] + (p3)[offset]) - -// __device__ __forceinline__ int sum(p0, p1, p2, p3, offset) -// { - -// } namespace cv { namespace gpu { namespace device { +namespace lbp{ struct Stage { int first; int ntrees; float threshold; - __device__ __forceinline__ Stage(int f = 0, int n = 0, float t = 0.f) : first(f), ntrees(n), threshold(t) {} - __device__ __forceinline__ Stage(const Stage& other) : first(other.first), ntrees(other.ntrees), threshold(other.threshold) {} }; struct ClNode { - int featureIdx; int left; int right; - __device__ __forceinline__ ClNode(int f = 0, int l = 0, int r = 0) : featureIdx(f), left(l), right(r) {} - __device__ __forceinline__ ClNode(const ClNode& other) : featureIdx(other.featureIdx), left(other.left), right(other.right) {} + int featureIdx; }; - struct Feature + struct LBP { - __device__ __forceinline__ Feature(const Feature& other) {(void)other;} - __device__ __forceinline__ Feature() {} + __device__ __forceinline__ LBP(const LBP& other) {(void)other;} + __device__ __forceinline__ LBP() {} //feature as uchar x, y - left top, z,w - right bottom - __device__ __forceinline__ uchar operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const + __device__ __forceinline__ int operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const { int x_off = 2 * feature.z; int y_off = 2 * feature.w; + // printf("feature: %d %d %d %d\n", (int)feature.x, (int)feature.y, (int)feature.z, (int)feature.w); + feature.z += feature.x; + feature.w += feature.y; // load feature key points int anchors[16]; + /* + P0-----P1-----P2-----P3 + | | | | + P4-----P5-----P6-----P7 + | | | | + P8-----P9-----P10----P11 + | | | | + P12----P13----P14----15 + */ anchors[0] = integral(y + feature.y, x + feature.x); anchors[1] = integral(y + feature.y, x + feature.z); - anchors[2] = integral(y + feature.y, x + x_off + feature.x); - anchors[3] = integral(y + feature.y, x + x_off + feature.z); + anchors[2] = integral(y + feature.y, x + feature.x + x_off); + anchors[3] = integral(y + feature.y, x + feature.z + x_off); anchors[4] = integral(y + feature.w, x + feature.x); anchors[5] = integral(y + feature.w, x + feature.z); - anchors[6] = integral(y + feature.w, x + x_off + feature.x); - anchors[7] = integral(y + feature.w, x + x_off + feature.z); + anchors[6] = integral(y + feature.w, x + feature.x + x_off); + anchors[7] = integral(y + feature.w, x + feature.z + x_off); anchors[8] = integral(y + y_off + feature.y, x + feature.x); anchors[9] = integral(y + y_off + feature.y, x + feature.z); @@ -114,7 +110,7 @@ namespace cv { namespace gpu { namespace device { // calculate feature int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10]; - uchar response = (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0) + int response = (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0) |(( (anchors[ 1] - anchors[ 2] - anchors[ 5] + anchors[ 6]) >= sum )? 64 : 0) |(( (anchors[ 2] - anchors[ 3] - anchors[ 6] + anchors[ 7]) >= sum )? 32 : 0) |(( (anchors[ 6] - anchors[ 7] - anchors[10] + anchors[11]) >= sum )? 16 : 0) @@ -122,11 +118,12 @@ namespace cv { namespace gpu { namespace device { |(( (anchors[ 9] - anchors[10] - anchors[13] + anchors[14]) >= sum )? 4 : 0) |(( (anchors[ 8] - anchors[ 9] - anchors[12] + anchors[13]) >= sum )? 2 : 0) |(( (anchors[ 4] - anchors[ 5] - anchors[ 8] + anchors[ 9]) >= sum )? 1 : 0); - return response; } - }; +} // lbp + + } } }// namespaces #endif \ No newline at end of file