From 433c3da7cfad09c191dc234f0693a86bc4fd628f Mon Sep 17 00:00:00 2001
From: Yan Wang <yan.wang@linux.intel.com>
Date: Mon, 23 Mar 2015 17:47:40 +0800
Subject: [PATCH] Optimize the performance of cascade OpenCL kernel.

1. Use built-in mad() instead of += and *.
2. For stump stages, skip the third-rectangle calculation when
weight.z == 0, because some features only have 2 rectangles.

This could improve the mean values of
OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/*
by about 10%.

Signed-off-by: Yan Wang <yan.wang@linux.intel.com>
---
 modules/objdetect/src/opencl/cascadedetect.cl | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/modules/objdetect/src/opencl/cascadedetect.cl b/modules/objdetect/src/opencl/cascadedetect.cl
index 854a7f617d..7ab581a282 100644
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -180,11 +180,11 @@ void runHaarClassifier(
                         int4 ofs = f->ofs[0];
                         sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
                         ofs = f->ofs[1];
-                        sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                        sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
                         if( weight.z > 0 )
                         {
                             ofs = f->ofs[2];
-                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                            sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
                         }
 
                         s += (sval < st.y*nf) ? st.z : st.w;
@@ -204,11 +204,11 @@ void runHaarClassifier(
 
                             sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
                             ofs = f->ofs[1];
-                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
                             if( weight.z > 0 )
                             {
                                 ofs = f->ofs[2];
-                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                                sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
                             }
 
                             idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
@@ -281,11 +281,12 @@ void runHaarClassifier(
                             int4 ofs = f->ofs[0];
                             float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
                             ofs = f->ofs[1];
-                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
                             //if( weight.z > 0 )
+                            if( fabs(weight.z) > 0 )
                             {
                                 ofs = f->ofs[2];
-                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                                sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
                             }
 
                             partsum += (sval < st.y*nf) ? st.z : st.w;
@@ -303,11 +304,11 @@ void runHaarClassifier(
 
                                 float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
                                 ofs = f->ofs[1];
-                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                                sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
                                 if( weight.z > 0 )
                                 {
                                     ofs = f->ofs[2];
-                                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                                    sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
                                 }
 
                                 idx = (sval < as_float(n.y)*nf) ? n.z : n.w;