From 4128d5782fb2d0322e71efe6d9a1eb90be9145dc Mon Sep 17 00:00:00 2001
From: Marina Kolpakova <no@email>
Date: Wed, 4 Jul 2012 04:51:09 +0000
Subject: [PATCH] added hypothesis filtration

---
 modules/gpu/src/cascadeclassifier.cpp      | 18 +++--
 modules/gpu/src/cuda/lbp.cu                | 84 +++++++++++++++++++++-
 modules/gpu/src/opencv2/gpu/device/lbp.hpp | 47 +++++++++++-
 3 files changed, 139 insertions(+), 10 deletions(-)
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 2057f1aee9..8932667e86 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -273,7 +273,7 @@ namespace cv { namespace gpu { namespace device
 {
     namespace lbp
     {
-        classifyStump(const DevMem2Db mstages,
+        void classifyStump(const DevMem2Db mstages,
                       const int nstages,
                       const DevMem2Di mnodes,
                       const DevMem2Df mleaves,
@@ -289,16 +289,19 @@ namespace cv { namespace gpu { namespace device
                       int subsetSize,
                       DevMem2D_<int4> objects,
                       unsigned int* classified);
+
+        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses);
     }
 }}}
 
 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
-                                                        double scaleFactor, int minNeighbors, cv::Size maxObjectSize /*, Size minSize=Size()*/)
+                                                        double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
 {
     CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
     CV_Assert(!empty());
 
     const int defaultObjSearchNum = 100;
+    const float grouping_eps = 0.2;
 
     if( !objects.empty() && objects.depth() == CV_32S)
         objects.reshape(4, 1);
@@ -340,11 +343,14 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
         integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, dclassified);
     }
-        cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
-        std::cout  << *classified << "Results:    " << cv::Mat(objects).row(0).colRange(0, *classified) << std::endl;
-    // TODO: reject levels
 
-    return 0;
+    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
+    GpuMat candidates(1, *classified, objects.type(), objects.ptr());
+    // std::cout  << *classified << " Results: " << cv::Mat(candidates) << std::endl;
+
+    if (groupThreshold <= 0  || objects.empty())
+        return 0;
+    return cv::gpu::device::lbp::connectedConmonents(candidates, groupThreshold, grouping_eps, dclassified);
 }
 
 // ============ old fashioned haar cascade ==============================================//
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index b07ecad0a0..5c273b32b9 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -41,6 +41,8 @@
 //M*/
 
 #include <opencv2/gpu/device/lbp.hpp>
+#include <opencv2/gpu/device/vec_traits.hpp>
+#include <opencv2/gpu/device/saturate_cast.hpp>
 
 namespace cv { namespace gpu { namespace device
 {
@@ -89,13 +91,83 @@ namespace cv { namespace gpu { namespace device
             objects(0, res) = rect;
         }
 
-        classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
+        // Groups candidates with one thread per rectangle; must run as a single block of exactly n threads.
+        // Dynamic shared memory holds n ints (labels, later class sizes) followed by n * 4 ints (rectangle sums).
+        // Kept rectangles are packed at the front of candidates.
+        // NOTE(review): the number of kept rectangles is not exported through nclasses - confirm how callers get it.
+        template<typename Pr>
+        __global__ void disjoin(int4* candidates, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        {
+            using cv::gpu::device::VecTraits;
+            __shared__ unsigned int nfound; // output cursor; no longer aliases the class-size slot labels[n - 1]
+            extern __shared__ int sbuff[];
+            const unsigned int tid = threadIdx.x;
+
+            int* labels = sbuff;
+            int* rrects = sbuff + n;
+            int* r1 = rrects + tid * 4;
+
+            Pr predicate(grouping_eps);
+            partition(candidates, n, labels, predicate);
+
+            r1[0] = r1[1] = r1[2] = r1[3] = 0;
+            if (tid == 0)
+                nfound = 0;
+            __syncthreads();
+
+            // add this rectangle to the sums of its class, then recycle labels[] as per-class counters
+            const int cls = labels[tid];
+            const int4 rect = candidates[tid];
+            atomicAdd(rrects + cls * 4 + 0, rect.x);
+            atomicAdd(rrects + cls * 4 + 1, rect.y);
+            atomicAdd(rrects + cls * 4 + 2, rect.z);
+            atomicAdd(rrects + cls * 4 + 3, rect.w);
+            labels[tid] = 0;
+            __syncthreads();
+
+            atomicInc((unsigned int*)labels + cls, n);
+            __syncthreads(); // all class sizes must be final before any thread reads them
+
+            const int n1 = labels[tid];
+            if (n1)
+            {
+                // this slot received at least one rectangle: turn its sums into an average
+                const float s = 1.f / n1;
+                r1[0] = saturate_cast<int>(r1[0] * s);
+                r1[1] = saturate_cast<int>(r1[1] * s);
+                r1[2] = saturate_cast<int>(r1[2] * s);
+                r1[3] = saturate_cast<int>(r1[3] * s);
+            }
+            __syncthreads(); // outside the branch so every thread reaches it; averages are now visible to all
+
+            bool keep = n1 > groupThreshold;
+            for (unsigned int j = 0; keep && j < n; j++)
+            {
+                const int n2 = labels[j];
+                if (!n2 || j == tid || n2 <= groupThreshold)
+                    continue;
+
+                const int* r2 = rrects + j * 4;
+                const int dx = saturate_cast<int>( r2[2] * grouping_eps );
+                const int dy = saturate_cast<int>( r2[3] * grouping_eps );
+
+                // reject r1 when it fits inside r2 grown by (dx, dy) and the count test below holds
+                if( r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
+                    r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
+                    (n2 > max(3, n1) || n1 < 3) )
+                    keep = false;
+            }
+
+            if (keep)
+                candidates[atomicInc(&nfound, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+        }
+
+        void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
                            const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
                            DevMem2D_<int4> objects, unsigned int* classified)
         {
             int blocks  = ceilf(workHeight / (float)step);
             int threads = ceilf(workWidth / (float)step);
-            // printf("blocks %d, threads %d\n", blocks, threads);
 
             Stage* stages = (Stage*)(mstages.ptr());
             ClNode* nodes = (ClNode*)(mnodes.ptr());
@@ -106,5 +178,13 @@ namespace cv { namespace gpu { namespace device
             lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integral,
                 workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
         }
+
+        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        {
+            const int threads = candidates.cols; // disjoin indexes by threadIdx.x: one thread per candidate, one block
+            if (threads > 0) // zero threads is an invalid launch configuration; shared memory = labels + rectangle sums
+                disjoin<InSameComponint><<<1, threads, threads * (sizeof(int) + sizeof(int4))>>>((int4*)candidates.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
+            return 0; // always 0: nothing is read back from the device here
+        }
     }
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index 3296ee3967..2b620b5627 100644
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -62,6 +62,50 @@ namespace lbp{
         int   featureIdx;
     };
 
+    struct InSameComponint
+    {
+        // Similarity predicate for two int4 rectangles; eps is stored halved and scaled per pair in operator().
+        __device__ __forceinline__ InSameComponint(float _eps) : eps(_eps * 0.5f) {}
+        __device__ __forceinline__ InSameComponint(const InSameComponint& other) : eps(other.eps) {}
+        // True when x, y, x + z and y + w of r1 and r2 each differ by at most delta (z, w act as extents).
+        __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
+        {
+            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)); // float keeps the test in single precision
+
+            return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
+                && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
+        }
+        float eps;
+    };
+
+    // Assigns a label to each of the n elements of vec: an element starts with its own index, and whenever
+    // predicate holds for a pair the larger of the two labels is lowered to the smaller one with atomicMin.
+    // Every thread writes labels[threadIdx.x], so the block needs one thread per element (the visible caller does so).
+    // NOTE(review): a single sweep is made; confirm that chains of overlapping elements end up with one label.
+    template<typename Pr>
+    __device__ __forceinline__ void partition(int4* vec, unsigned int n, int* labels, Pr predicate)
+    {
+        const unsigned self = threadIdx.x;
+        labels[self] = self;
+        __syncthreads();
+
+        for (unsigned int other = 0; other < n; ++other)
+        {
+            if (self == other || !predicate(vec[self], vec[other]))
+                continue;
+
+            const int mine = labels[self];
+            const int theirs = labels[other];
+
+            if (mine < theirs)
+                atomicMin(labels + other, mine);
+            else if (theirs < mine)
+                atomicMin(labels + self, theirs);
+        }
+
+        __syncthreads(); // barrier after the sweep so later reads of labels see every update
+    }
+
     struct LBP
     {
         __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
@@ -72,7 +116,6 @@ namespace lbp{
         {
             int x_off = 2 * feature.z;
             int y_off = 2 * feature.w;
-            // printf("feature: %d %d %d %d\n", (int)feature.x, (int)feature.y, (int)feature.z, (int)feature.w);
             feature.z += feature.x;
             feature.w += feature.y;
 
@@ -107,7 +150,7 @@ namespace lbp{
             anchors[14] = integral(y + y_off + feature.w, x + x_off + feature.x);
             anchors[15] = integral(y + y_off + feature.w, x + x_off + feature.z);
 
-            // calculate feature
+            // calculate response
             int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10];
 
             int response =   (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)