added hypothesis filtration

13 years ago · 4128d5782f
parent a53f0f397e
commit 4128d5782f
3 changed files with 139 additions and 10 deletions
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -273,7 +273,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace lbp
    {
-        classifyStump(const DevMem2Db mstages,
+        void classifyStump(const DevMem2Db mstages,
                      const int nstages,
                      const DevMem2Di mnodes,
                      const DevMem2Df mleaves,
@ -289,16 +289,19 @@ namespace cv { namespace gpu { namespace device
                      int subsetSize,
                      DevMem2D_<int4> objects,
                      unsigned int* classified);
+
+        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses);
    }
 }}}

 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
-                                                        double scaleFactor, int minNeighbors, cv::Size maxObjectSize /*, Size minSize=Size()*/)
+                                                        double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
 {
    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
    CV_Assert(!empty());

    const int defaultObjSearchNum = 100;
+    const float grouping_eps = 0.2;

    if( !objects.empty() && objects.depth() == CV_32S)
        objects.reshape(4, 1);
@ -340,11 +343,14 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
        cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
        integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, dclassified);
    }
-        cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
-        std::cout  << *classified << "Results:    " << cv::Mat(objects).row(0).colRange(0, *classified) << std::endl;
-    // TODO: reject levels

-    return 0;
+    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
+    GpuMat candidates(1, *classified, objects.type(), objects.ptr());
+    // std::cout  << *classified << " Results: " << cv::Mat(candidates) << std::endl;
+
+    if (groupThreshold <= 0  || objects.empty())
+        return 0;
+    return cv::gpu::device::lbp::connectedConmonents(candidates, groupThreshold, grouping_eps, dclassified);
 }

 // ============ old fashioned haar cascade ==============================================//
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@ -41,6 +41,8 @@
 //M*/

 #include <opencv2/gpu/device/lbp.hpp>
+#include <opencv2/gpu/device/vec_traits.hpp>
+#include <opencv2/gpu/device/saturate_cast.hpp>

 namespace cv { namespace gpu { namespace device
 {
@ -89,13 +91,83 @@ namespace cv { namespace gpu { namespace device
            objects(0, res) = rect;
        }

-        classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
+        // Groups candidate rectangles and compacts one averaged rectangle per
+        // accepted group back into the front of `candidates`.
+        //
+        // Launch layout: a single block with exactly `n` threads (one thread per
+        // candidate); every thread indexes the shared arrays with threadIdx.x.
+        // Dynamic shared memory: n ints (labels) followed by 4 * n ints
+        // (per-class rectangle accumulators).
+        //
+        // candidates     - in: n rectangles; out: the accepted group rectangles,
+        //                  written to indices [0, *nclasses) in unspecified order.
+        // n              - number of candidates.
+        // groupThreshold - a class is considered only if it has more than this
+        //                  many members.
+        // grouping_eps   - forwarded to the predicate and used as the relative
+        //                  margin of the containment test below.
+        // nclasses       - out: number of rectangles written back to `candidates`.
+        template<typename Pr>
+        __global__ void disjoin(int4* candidates, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        {
+            using cv::gpu::device::VecTraits;
+            unsigned int tid = threadIdx.x;
+            extern __shared__ int sbuff[];
+
+            // Dedicated output counter. Keeping it separate from labels[] means
+            // it can neither clobber a class size nor be mistaken for one.
+            __shared__ unsigned int nfound;
+
+            int* labels = sbuff;
+            int* rrects = sbuff + n;
+
+            Pr predicate(grouping_eps);
+            partition(candidates, n, labels, predicate);
+
+            rrects[tid * 4 + 0] = 0;
+            rrects[tid * 4 + 1] = 0;
+            rrects[tid * 4 + 2] = 0;
+            rrects[tid * 4 + 3] = 0;
+            if (tid == 0)
+                nfound = 0;
+            __syncthreads();
+
+            // Accumulate this candidate into the slot of its class.
+            int cls = labels[tid];
+            const int4 own = candidates[tid];
+            atomicAdd(rrects + cls * 4 + 0, own.x);
+            atomicAdd(rrects + cls * 4 + 1, own.y);
+            atomicAdd(rrects + cls * 4 + 2, own.z);
+            atomicAdd(rrects + cls * 4 + 3, own.w);
+
+            // From here on labels[k] holds the member count of class k.
+            labels[tid] = 0;
+            __syncthreads();
+
+            atomicInc((unsigned int*)labels + cls, n);
+            // Every increment must land before any count is read.
+            __syncthreads();
+
+            int active = labels[tid];
+            int* r1 = rrects + tid * 4;
+            if (active)
+            {
+                // Turn the accumulated sums into the class average.
+                float s = 1.f / active;
+                r1[0] = saturate_cast<int>(r1[0] * s);
+                r1[1] = saturate_cast<int>(r1[1] * s);
+                r1[2] = saturate_cast<int>(r1[2] * s);
+                r1[3] = saturate_cast<int>(r1[3] * s);
+            }
+            // Barrier kept outside the branch so that all threads reach it; the
+            // averaged rectangles of other classes are read below.
+            __syncthreads();
+
+            if (active > groupThreshold)
+            {
+                unsigned int j;
+                for (j = 0; j < n; j++)
+                {
+                    int n2 = labels[j];
+                    if (!n2 || j == tid || n2 <= groupThreshold)
+                        continue;
+
+                    int* r2 = rrects + j * 4;
+
+                    int dx = saturate_cast<int>( r2[2] * grouping_eps );
+                    int dy = saturate_cast<int>( r2[3] * grouping_eps );
+
+                    // Reject r1 when it lies inside r2 (enlarged by the margin)
+                    // and r2's class is the better supported one.
+                    if( r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
+                        r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
+                        (n2 > max(3, active) || active < 3) )
+                        break;
+                }
+
+                if (j == n)
+                {
+                    // All reads of the input candidates completed before the
+                    // barriers above, so writing the result in place is safe.
+                    candidates[atomicInc(&nfound, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+                }
+            }
+            __syncthreads();
+
+            // Publish how many rectangles were written back.
+            if (tid == 0)
+                *nclasses = nfound;
+        }
+
+        void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
                           const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
                           DevMem2D_<int4> objects, unsigned int* classified)
        {
            int blocks  = ceilf(workHeight / (float)step);
            int threads = ceilf(workWidth / (float)step);
-            // printf("blocks %d, threads %d\n", blocks, threads);

            Stage* stages = (Stage*)(mstages.ptr());
            ClNode* nodes = (ClNode*)(mnodes.ptr());
@ -106,5 +178,13 @@ namespace cv { namespace gpu { namespace device
            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integral,
                workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
        }
+
+        // Host wrapper that launches the `disjoin` kernel over the candidate row.
+        //
+        // One block is launched with one thread per candidate, so the number of
+        // candidates must not exceed the device's per-block thread limit.
+        // Shared memory holds one int label plus one int4 accumulator per
+        // candidate. Always returns 0.
+        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        {
+            int threads = candidates.cols;
+
+            // A zero-thread block is an invalid launch configuration; with no
+            // candidates there is nothing to group.
+            if (threads <= 0)
+                return 0;
+
+            size_t smem_amount = threads * sizeof(int) + threads * sizeof(int4);
+            disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
+            return 0;
+        }
    }
 }}}
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@ -62,6 +62,50 @@ namespace lbp{
        int   featureIdx;
    };

+    // Binary predicate telling whether two rectangles are close enough to be
+    // put into the same group. Rectangles are int4 values where x/y are the
+    // origin and z/w are used as the extents (the far edges are computed as
+    // x + z and y + w).
+    struct InSameComponint
+    {
+    public:
+        // Stores half of the requested tolerance. The float literal keeps the
+        // computation in single precision on the device.
+        __device__ __forceinline__ InSameComponint(float _eps) : eps(_eps * 0.5f) {}
+        __device__ __forceinline__ InSameComponint(const InSameComponint& other) : eps(other.eps) {}
+
+        // Returns true when all four edges of r1 and r2 differ by at most
+        // eps * (min width + min height).
+        __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
+        {
+            // The product is already evaluated in float; storing it in a float
+            // avoids promoting the four comparisons to double precision.
+            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w));
+
+            return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
+                && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
+        }
+        float eps;
+    };
+
+    // Assigns a class label to every element of `vec`, one thread per element.
+    //
+    // Each thread starts with its own index as label, then walks over all
+    // elements once; whenever `predicate` links it with another element, the
+    // larger of the two current labels is lowered to the smaller one with
+    // atomicMin. Must be called by all threads of the block (it contains
+    // barriers), and `labels` must have room for one int per thread.
+    //
+    // NOTE(review): labels are propagated in a single pass, so chains of
+    // pairwise-linked elements may not all end up with one common label -
+    // confirm this is acceptable for the callers.
+    template<typename Pr>
+    __device__ __forceinline__ void partition(int4* vec, unsigned int n, int* labels, Pr predicate)
+    {
+        const unsigned self = threadIdx.x;
+
+        labels[self] = self;
+        __syncthreads();
+
+        for (unsigned int other = 0; other < n; ++other)
+        {
+            // Skip the element itself and everything it is not linked with.
+            if (other == self || !predicate(vec[self], vec[other]))
+                continue;
+
+            const int mine   = labels[self];
+            const int theirs = labels[other];
+
+            if (mine < theirs)
+                atomicMin(labels + other, mine);
+            else if (theirs < mine)
+                atomicMin(labels + self, theirs);
+        }
+        __syncthreads();
+    }
+
    struct LBP
    {
        __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
@ -72,7 +116,6 @@ namespace lbp{
        {
            int x_off = 2 * feature.z;
            int y_off = 2 * feature.w;
-            // printf("feature: %d %d %d %d\n", (int)feature.x, (int)feature.y, (int)feature.z, (int)feature.w);
            feature.z += feature.x;
            feature.w += feature.y;

@ -107,7 +150,7 @@ namespace lbp{
            anchors[14] = integral(y + y_off + feature.w, x + x_off + feature.x);
            anchors[15] = integral(y + y_off + feature.w, x + x_off + feature.z);

-            // calculate feature
+            // calculate response
            int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10];

            int response =   (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)