From 86d785622bf5215fa889fd4429dc02de6d90c10a Mon Sep 17 00:00:00 2001
From: Marina Kolpakova <no@email>
Date: Tue, 10 Jul 2012 11:58:15 +0000
Subject: [PATCH] LBP: switched to texture implementation

---
 modules/gpu/include/opencv2/gpu/gpu.hpp    |   2 +-
 modules/gpu/perf/perf_objdetect.cpp        |   8 +-
 modules/gpu/src/cascadeclassifier.cpp      |  59 +++++-----
 modules/gpu/src/cuda/lbp.cu                | 120 ++++++++++++++++++---
 modules/gpu/src/opencv2/gpu/device/lbp.hpp |  82 --------------
 modules/gpu/test/test_objdetect.cpp        |   5 +-
 6 files changed, 144 insertions(+), 132 deletions(-)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 4c3af5956c..fd72d9e9d7 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1435,7 +1435,7 @@ public:
     bool load(const std::string& filename);
     void release();
 
-    int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
     cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/);
     void preallocateIntegralBuffer(cv::Size desired);
     Size getClassifierSize() const;
diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index e1990488a3..a9b3d7e547 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -69,16 +69,14 @@ GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
 
 
     cv::gpu::GpuMat img(img_host);
-        cv::gpu::GpuMat gpu_rects, buffer;
+        cv::gpu::GpuMat gpu_rects;
     cv::gpu::CascadeClassifier_GPU_LBP cascade(img.size());
     ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
 
-    // cascade.detectMultiScale(img, objects_buffer);
-    cascade.detectMultiScale(img, buffer, gpu_rects);
-
+    cascade.detectMultiScale(img, gpu_rects);
     TEST_CYCLE()
     {
-        cascade.detectMultiScale(img, buffer, gpu_rects);
+        cascade.detectMultiScale(img, gpu_rects);
     }
 }
 
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index e8522568e3..5422dcf99d 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -70,7 +70,7 @@ Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const
 void cv::gpu::CascadeClassifier_GPU_LBP::preallocateIntegralBuffer(cv::Size /*desired*/) { throw_nogpu();}
 void cv::gpu::CascadeClassifier_GPU_LBP::initializeBuffers(cv::Size /*frame*/)       { throw_nogpu();}
 
-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*scaledImageBuffer*/, cv::gpu::GpuMat& /*objectsBuf*/,
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
 double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
 
 #else
@@ -299,28 +299,29 @@ namespace cv { namespace gpu { namespace device
 {
     namespace lbp
     {
-        void classifyStump(const DevMem2Db mstages,
-                      const int nstages,
-                      const DevMem2Di mnodes,
-                      const DevMem2Df mleaves,
-                      const DevMem2Di msubsets,
-                      const DevMem2Db mfeatures,
-                      const DevMem2Di integral,
-                      const int workWidth,
-                      const int workHeight,
-                      const int clWidth,
-                      const int clHeight,
-                      float scale,
-                      int step,
-                      int subsetSize,
-                      DevMem2D_<int4> objects,
-                      unsigned int* classified);
-
-        int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void classifyStump(const DevMem2Db& mstages,
+                          const int nstages,
+                          const DevMem2Di& mnodes,
+                          const DevMem2Df& mleaves,
+                          const DevMem2Di& msubsets,
+                          const DevMem2Db& mfeatures,
+                          const int workWidth,
+                          const int workHeight,
+                          const int clWidth,
+                          const int clHeight,
+                          float scale,
+                          int step,
+                          int subsetSize,
+                          DevMem2D_<int4> objects,
+                          unsigned int* classified);
+
+        int connectedConmonents(DevMem2D_<int4>  candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void bindIntegral(DevMem2Di integral);
+        void unbindIntegral();
     }
 }}}
 
-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
                                                         double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
 {
     CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
@@ -332,10 +333,12 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     if( !objects.empty() && objects.depth() == CV_32S)
         objects.reshape(4, 1);
     else
-        objects.create(1 , defaultObjSearchNum, CV_32SC4);
-
-    GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
-    // GpuMat candidates(objects);
+        objects.create(1 , image.cols >> 4, CV_32SC4);
+    GpuMat candidates(1 , image.cols >> 1, CV_32SC4);
+    // GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
+    // used for debug
+    // candidates.setTo(cv::Scalar::all(0));
+    // objects.setTo(cv::Scalar::all(0));
     if (maxObjectSize == cv::Size())
         maxObjectSize = image.size();
 
@@ -347,9 +350,11 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     cudaMalloc(&dclassified, sizeof(int));
     cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
     int step;
+    cv::gpu::device::lbp::bindIntegral(integral);
 
     for( double factor = 1; ; factor *= scaleFactor )
     {
+        // if (factor > 2.0) break;
         cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
         cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
         cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
@@ -365,7 +370,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
 
         GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
         GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
-        GpuMat currBuff = integralBuffer;//(integralBuffer, cv::Rect(0, 0, integralBuffer.width, integralBuffer.height));
+        GpuMat currBuff = integralBuffer;
 
         cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
         cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
@@ -373,8 +378,10 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         step = (factor <= 2.) + 1;
 
         cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
-        scaledIntegral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
+        processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
     }
+
+    cv::gpu::device::lbp::unbindIntegral();
     if (groupThreshold <= 0  || objects.empty())
         return 0;
     cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index 769430ed8d..ba2e29448f 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -48,8 +48,102 @@ namespace cv { namespace gpu { namespace device
 {
     namespace lbp
     {
+
+        texture<int, cudaTextureType2D, cudaReadModeElementType> tintegral(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+        struct LBP
+        {
+            __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
+            __device__ __forceinline__ LBP() {}
+
+            //feature as uchar x, y - left top, z,w - right bottom
+            __device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
+            {
+                int anchors[9];
+
+                anchors[0]  = tex2D(tintegral, tx, ty);
+                anchors[1]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[0] -= anchors[1];
+                anchors[2]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[1] -= anchors[2];
+                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                ty += fh;
+                anchors[3]  = tex2D(tintegral, tx, ty);
+                anchors[4]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[3] -= anchors[4];
+                anchors[5]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[4] -= anchors[5];
+                anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                anchors[0] -= anchors[3];
+                anchors[1] -= anchors[4];
+                anchors[2] -= anchors[5];
+                // 0 - 2 contains s0 - s2
+
+                ty += fh;
+                anchors[6]  = tex2D(tintegral, tx, ty);
+                anchors[7]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[6] -= anchors[7];
+                anchors[8]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[7] -= anchors[8];
+                anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                anchors[3] -= anchors[6];
+                anchors[4] -= anchors[7];
+                anchors[5] -= anchors[8];
+                // 3 - 5 contains s3 - s5
+
+                anchors[0] -= anchors[4];
+                anchors[1] -= anchors[4];
+                anchors[2] -= anchors[4];
+                anchors[3] -= anchors[4];
+                anchors[5] -= anchors[4];
+
+                int response = (~(anchors[0] >> 31)) & 4;
+                response |= (~(anchors[1] >> 31)) & 2;;
+                response |= (~(anchors[2] >> 31)) & 1;
+
+                shift = (~(anchors[5] >> 31)) & 16;
+                shift |= (~(anchors[3] >> 31)) & 1;
+
+                ty += fh;
+                anchors[0]  = tex2D(tintegral, tx, ty);
+                anchors[1]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[0] -= anchors[1];
+                anchors[2]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[1] -= anchors[2];
+                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                anchors[6] -= anchors[0];
+                anchors[7] -= anchors[1];
+                anchors[8] -= anchors[2];
+                // 0 -2 contains s6 - s8
+
+                anchors[6] -= anchors[4];
+                anchors[7] -= anchors[4];
+                anchors[8] -= anchors[4];
+
+                shift |= (~(anchors[6] >> 31)) & 2;
+                shift |= (~(anchors[7] >> 31)) & 4;
+                shift |= (~(anchors[8] >> 31)) & 8;
+                return response;
+            }
+        };
+
+        void bindIntegral(DevMem2Di integral)
+        {
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+            cudaSafeCall( cudaBindTexture2D(0, &tintegral, integral.ptr(), &desc, (size_t)integral.cols, (size_t)integral.rows, (size_t)integral.step));
+        }
+
+        void unbindIntegral()
+        {
+             cudaSafeCall( cudaUnbindTexture(&tintegral));
+        }
+
         __global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
-            const int* integral, const int istep, const int workWidth,const int workHeight, const int clWidth, const int clHeight, const float scale, const int step,
+           /* const int* integral,const int istep,  const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
             const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
         {
             int x = threadIdx.x * step;
@@ -63,21 +157,18 @@ namespace cv { namespace gpu { namespace device
             {
                 float sum = 0;
                 Stage stage = stages[s];
-
                 for (int t = 0; t < stage.ntrees; t++)
                 {
                     ClNode node = nodes[current_node];
 
                     uchar4 feature = features[node.featureIdx];
-                    int c = evaluator( (y + feature.y) * istep + x + feature.x , feature.w * istep, feature.z, integral, istep);
-                    const int* subsetIdx = subsets + (current_node * subsetSize);
-
-                    int idx =  (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
+                    int shift;
+                    int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
+                    int idx =  (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
                     sum += leaves[idx];
                     current_node += 1;
                     current_leave += 2;
                 }
-
                 if (sum < stage.threshold)
                     return;
             }
@@ -85,8 +176,8 @@ namespace cv { namespace gpu { namespace device
             int4 rect;
             rect.x = roundf(x * scale);
             rect.y = roundf(y * scale);
-            rect.z = roundf(clWidth);
-            rect.w = roundf(clHeight);
+            rect.z = clWidth;
+            rect.w = clHeight;
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
             int res = __atomicInc(n, 100U);
 #else
@@ -178,8 +269,8 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
-        void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
-                           const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
+        void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+                           /*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
                            DevMem2D_<int4> objects, unsigned int* classified)
         {
             int blocks  = ceilf(workHeight / (float)step);
@@ -190,11 +281,8 @@ namespace cv { namespace gpu { namespace device
             const float* leaves = mleaves.ptr();
             const int* subsets = msubsets.ptr();
             const uchar4* features = (uchar4*)(mfeatures.ptr());
-            const int* integ = integral.ptr();
-            int istep = integral.step / sizeof(int);
-
-            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integ, istep,
-                workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
+            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
+                workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
         }
 
         int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index b5ef365846..69867c9939 100644
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -153,90 +153,8 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
         __syncthreads();
         // printf("tid %d label %d\n", tid, labels[tid]);
     }
-
-    struct LBP
-    {
-        __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
-        __device__ __forceinline__ LBP() {}
-
-        //feature as uchar x, y - left top, z,w - right bottom
-        __device__ __forceinline__ int operator() (unsigned int y, int featurew, int featurez, const int* integral, int step) const
-        {
-            int x_off = 2 * featurez;
-            int anchors[9];
-
-            anchors[0]  = integral[y];
-            anchors[1]  = integral[y + featurez];
-            anchors[0] -= anchors[1];
-            anchors[2]  = integral[y + x_off];
-            anchors[1] -= anchors[2];
-            anchors[2] -= integral[y + featurez + x_off];
-            y += featurew;
-
-            anchors[3]  = integral[y];
-            anchors[4]  = integral[y + featurez];
-            anchors[3] -= anchors[4];
-            anchors[5]  = integral[y + x_off];
-            anchors[4] -= anchors[5];
-            anchors[5] -= integral[y + featurez + x_off];
-
-            anchors[0] -= anchors[3];
-            anchors[1] -= anchors[4];
-            anchors[2] -= anchors[5];
-            // 0 - 2 contains s0 - s2
-
-            y += featurew;
-            anchors[6]  = integral[y];
-            anchors[7]  = integral[y + featurez];
-            anchors[6] -= anchors[7];
-            anchors[8]  = integral[y + x_off];
-            anchors[7] -= anchors[8];
-            anchors[8] -= integral[y + x_off + featurez];
-
-            anchors[3] -= anchors[6];
-            anchors[4] -= anchors[7];
-            anchors[5] -= anchors[8];
-            // 3 - 5 contains s3 - s5
-
-            anchors[0] -= anchors[4];
-            anchors[1] -= anchors[4];
-            anchors[2] -= anchors[4];
-            anchors[3] -= anchors[4];
-            anchors[5] -= anchors[4];
-
-            int response = (~(anchors[0] >> 31)) & 128;
-            response |= (~(anchors[1] >> 31)) & 64;;
-            response |= (~(anchors[2] >> 31)) & 32;
-            response |= (~(anchors[5] >> 31)) & 16;
-            response |= (~(anchors[3] >> 31)) & 1;
-
-            y += featurew;
-            anchors[0]  = integral[y];
-            anchors[1]  = integral[y + featurez];
-            anchors[0] -= anchors[1];
-            anchors[2]  = integral[y + x_off];
-            anchors[1] -= anchors[2];
-            anchors[2] -= integral[y + x_off + featurez];
-
-            anchors[6] -= anchors[0];
-            anchors[7] -= anchors[1];
-            anchors[8] -= anchors[2];
-            // 0 -2 contains s6 - s8
-
-            anchors[6] -= anchors[4];
-            anchors[7] -= anchors[4];
-            anchors[8] -= anchors[4];
-
-            response |= (~(anchors[6] >> 31)) & 2;
-            response |= (~(anchors[7] >> 31)) & 4;
-            response |= (~(anchors[8] >> 31)) & 8;
-
-            return response;
-        }
-    };
 } // lbp
 
-
 } } }// namespaces
 
 #endif
\ No newline at end of file
diff --git a/modules/gpu/test/test_objdetect.cpp b/modules/gpu/test/test_objdetect.cpp
index 8b49538d40..fdd9454c86 100644
--- a/modules/gpu/test/test_objdetect.cpp
+++ b/modules/gpu/test/test_objdetect.cpp
@@ -343,15 +343,16 @@ TEST_P(LBP_classify, Accuracy)
 
     cv::gpu::CascadeClassifier_GPU_LBP gpuClassifier;
     ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
-    cv::gpu::GpuMat gpu_rects, buffer;
+    cv::gpu::GpuMat gpu_rects;
     cv::gpu::GpuMat tested(grey);
-    int count = gpuClassifier.detectMultiScale(tested, buffer, gpu_rects);
+    int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
 
     cv::Mat gpu_f(gpu_rects);
     int* gpu_faces = (int*)gpu_f.ptr();
     for (int i = 0; i < count; i++)
     {
         cv::Rect r(gpu_faces[i * 4],gpu_faces[i * 4 + 1],gpu_faces[i * 4 + 2],gpu_faces[i * 4 + 3]);
+        std::cout << gpu_faces[i * 4]<< " " << gpu_faces[i * 4 + 1] << " " << gpu_faces[i * 4 + 2] << " " << gpu_faces[i * 4 + 3] << std::endl;
         cv::rectangle(markedImage, r , cv::Scalar(0, 0, 255, 255));
     }
 }