Fix copy-paste bug in AVX optimization of haar

13 years ago · 089de14ed7
parent f32eb05ea1
commit 089de14ed7
1 changed files with 424 additions and 352 deletions
--- a/modules/objdetect/src/haar.cpp
+++ b/modules/objdetect/src/haar.cpp
@ -45,7 +45,6 @@
 #include <stdio.h>
 #include "opencv2/core/internal.hpp"

-
 #if CV_SSE2 || CV_SSE3
 #   if !CV_SSE4_1 && !CV_SSE4_2
 #       define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
@ -53,13 +52,13 @@
 #   endif
 #endif

-#   if  CV_AVX
+#if CV_AVX
 #  define CV_HAAR_USE_AVX 1
-#   else
+#else
 #  if CV_SSE2 || CV_SSE3
 #    define CV_HAAR_USE_SSE 1
 #  endif
-#   endif
+#endif

 /* these settings affect the quality of detection: change with care */
 #define CV_ADJUST_FEATURES 1
@ -76,8 +75,7 @@ typedef struct CvHidHaarFeature
        float weight;
    }
    rect[CV_HAAR_FEATURE_MAX];
-}
-CvHidHaarFeature;
+} CvHidHaarFeature;


 typedef struct CvHidHaarTreeNode
@ -86,8 +84,7 @@ typedef struct CvHidHaarTreeNode
    float threshold;
    int left;
    int right;
-}
-CvHidHaarTreeNode;
+} CvHidHaarTreeNode;


 typedef struct CvHidHaarClassifier
@ -96,8 +93,7 @@ typedef struct CvHidHaarClassifier
    //CvHaarFeature* orig_feature;
    CvHidHaarTreeNode* node;
    float* alpha;
-}
-CvHidHaarClassifier;
+} CvHidHaarClassifier;


 typedef struct CvHidHaarStageClassifier
@ -110,11 +106,10 @@ typedef struct CvHidHaarStageClassifier
    struct CvHidHaarStageClassifier* next;
    struct CvHidHaarStageClassifier* child;
    struct CvHidHaarStageClassifier* parent;
-}
-CvHidHaarStageClassifier;
+} CvHidHaarStageClassifier;


-struct CvHidHaarClassifierCascade
+typedef struct CvHidHaarClassifierCascade
 {
    int  count;
    int  isStumpBased;
@ -127,7 +122,7 @@ struct CvHidHaarClassifierCascade
    sumtype *p0, *p1, *p2, *p3;

    void** ipp_stages;
-};
+} CvHidHaarClassifierCascade;


 const int icv_object_win_border = 1;
@ -634,21 +629,21 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
 }


-//AVX version icvEvalHidHaarClassifier.  Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
+// AVX version of icvEvalHidHaarClassifier: processes 8 CvHidHaarClassifiers per call. The caller must check AVX support before invoking it.
 #ifdef CV_HAAR_USE_AVX
 CV_INLINE
 double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
                                    double variance_norm_factor, size_t p_offset )
 {
    int  CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
-    char flags[8] = {0,0,0,0,0,0,0,0};
+    uchar flags[8] = {0,0,0,0,0,0,0,0};
    CvHidHaarTreeNode* nodes[8];
    double res = 0;
-    char exitConditionFlag = 0;
+    uchar exitConditionFlag = 0;
    for(;;)
    {
        float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
-        nodes[0] = classifier    ->node + idxV[0];
+        nodes[0] = (classifier+0)->node + idxV[0];
        nodes[1] = (classifier+1)->node + idxV[1];
        nodes[2] = (classifier+2)->node + idxV[2];
        nodes[3] = (classifier+3)->node + idxV[3];
@ -658,46 +653,79 @@ double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
        nodes[7] = (classifier+7)->node + idxV[7];

        __m256 t = _mm256_set1_ps(variance_norm_factor);
-        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));

-        __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
-                calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
-                p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
-        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
-                nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
-        __m256 sum = _mm256_mul_ps(offset, weight);
+        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
+                                           nodes[6]->threshold,
+                                           nodes[5]->threshold,
+                                           nodes[4]->threshold,
+                                           nodes[3]->threshold,
+                                           nodes[2]->threshold,
+                                           nodes[1]->threshold,
+                                           nodes[0]->threshold));
+
+        __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0], p_offset),
+                                      calc_sum(nodes[6]->feature.rect[0], p_offset),
+                                      calc_sum(nodes[5]->feature.rect[0], p_offset),
+                                      calc_sum(nodes[4]->feature.rect[0], p_offset),
+                                      calc_sum(nodes[3]->feature.rect[0], p_offset),
+                                      calc_sum(nodes[2]->feature.rect[0], p_offset),
+                                      calc_sum(nodes[1]->feature.rect[0], p_offset),
+                                      calc_sum(nodes[0]->feature.rect[0], p_offset));
+
+        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
+                                      nodes[6]->feature.rect[0].weight,
+                                      nodes[5]->feature.rect[0].weight,
+                                      nodes[4]->feature.rect[0].weight,
+                                      nodes[3]->feature.rect[0].weight,
+                                      nodes[2]->feature.rect[0].weight,
+                                      nodes[1]->feature.rect[0].weight,
+                                      nodes[0]->feature.rect[0].weight);

-        offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
-                calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
-                calc_sum(nodes[0]->feature.rect[1],p_offset));
-        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
-                nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
+        __m256 sum = _mm256_mul_ps(offset, weight);

-        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+        offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1], p_offset),
+                               calc_sum(nodes[6]->feature.rect[1], p_offset),
+                               calc_sum(nodes[5]->feature.rect[1], p_offset),
+                               calc_sum(nodes[4]->feature.rect[1], p_offset),
+                               calc_sum(nodes[3]->feature.rect[1], p_offset),
+                               calc_sum(nodes[2]->feature.rect[1], p_offset),
+                               calc_sum(nodes[1]->feature.rect[1], p_offset),
+                               calc_sum(nodes[0]->feature.rect[1], p_offset));
+
+        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
+                               nodes[6]->feature.rect[1].weight,
+                               nodes[5]->feature.rect[1].weight,
+                               nodes[4]->feature.rect[1].weight,
+                               nodes[3]->feature.rect[1].weight,
+                               nodes[2]->feature.rect[1].weight,
+                               nodes[1]->feature.rect[1].weight,
+                               nodes[0]->feature.rect[1].weight);
+
+        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

        if( nodes[0]->feature.rect[2].p0 )
-            tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
+            tmp[0] = calc_sum(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
        if( nodes[1]->feature.rect[2].p0 )
-            tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
+            tmp[1] = calc_sum(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
        if( nodes[2]->feature.rect[2].p0 )
-            tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
+            tmp[2] = calc_sum(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
        if( nodes[3]->feature.rect[2].p0 )
-            tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
+            tmp[3] = calc_sum(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
        if( nodes[4]->feature.rect[2].p0 )
-            tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
+            tmp[4] = calc_sum(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
        if( nodes[5]->feature.rect[2].p0 )
-            tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
+            tmp[5] = calc_sum(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
        if( nodes[6]->feature.rect[2].p0 )
-            tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
+            tmp[6] = calc_sum(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
        if( nodes[7]->feature.rect[2].p0 )
-            tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
+            tmp[7] = calc_sum(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;

        sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));

-        __m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left);
+        __m256 left  = _mm256_set_ps(nodes[7]->left, nodes[6]->left, nodes[5]->left, nodes[4]->left, nodes[3]->left, nodes[2]->left, nodes[1]->left, nodes[0]->left );
        __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right);

-        _mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ ))));
+        _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));

        for(int i = 0; i < 8; i++)
        {
@ -706,17 +734,17 @@ double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
                if(!flags[i])
                {
                    exitConditionFlag++;
-                    flags[i]=1;
-                    res+=((classifier+i)->alpha[-idxV[i]]);
+                    flags[i] = 1;
+                    res += (classifier+i)->alpha[-idxV[i]];
                }
                idxV[i]=0;
            }
        }
-        if(exitConditionFlag==8)
+        if(exitConditionFlag == 8)
            return res;
    }
 }
-#endif
+#endif //CV_HAAR_USE_AVX

 CV_INLINE
 double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
@ -778,18 +806,16 @@ static int
 cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                               CvPoint pt, double& stage_sum, int start_stage )
 {
-    #ifdef CV_HAAR_USE_AVX
+#ifdef CV_HAAR_USE_AVX
    bool haveAVX = false;
    if(cv::checkHardwareSupport(CV_CPU_AVX))
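    if(__xgetbv()&0x6)// Check via XCR0 that the OS saves/restores the YMM registers. NOTE(review): this passes if either bit 1 or bit 2 is set; AVX detection normally requires both (== 0x6) - confirm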
-                {
       haveAVX = true;
-                }
-    #else
-        #ifdef CV_HAAR_USE_SSE
+#else
+#  ifdef CV_HAAR_USE_SSE
    bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
-        #endif
-    #endif
+#  endif
+#endif

    int p_offset, pq_offset;
    int i, j;
@ -828,19 +854,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
        while( ptr )
        {
            stage_sum = 0.0;
+            j = 0;

-            #ifdef CV_HAAR_USE_AVX
+#ifdef CV_HAAR_USE_AVX
            if(haveAVX)
            {
-                for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
+                for( ; j <= ptr->count - 8; j += 8 )
                {
                    stage_sum += icvEvalHidHaarClassifierAVX(
-                        cascade->stage_classifier[i].classifier+j,
+                        ptr->classifier + j,
                        variance_norm_factor, p_offset );
                }
            }
-            #endif
-            for( j = 0; j < ptr->count; j++ )
+#endif
+            for( ; j < ptr->count; j++ )
            {
                stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset );
            }
@ -860,7 +887,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
    }
    else if( cascade->isStumpBased )
    {
-    #ifdef CV_HAAR_USE_AVX
+#ifdef CV_HAAR_USE_AVX
        if(haveAVX)
        {
            CvHidHaarClassifier* classifiers[8];
@ -872,15 +899,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                float CV_DECL_ALIGNED(32) buf[8];
                if( cascade->stage_classifier[i].two_rects )
                {
-                        for( ; j <= cascade->stage_classifier[i].count-8; j+=8 )
+                    for( ; j <= cascade->stage_classifier[i].count - 8; j += 8 )
                    {
-                            //__m256 stage_sumPart = _mm256_setzero_ps();
                        classifiers[0] = cascade->stage_classifier[i].classifier + j;
                        nodes[0] = classifiers[0]->node;
                        classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
                        nodes[1] = classifiers[1]->node;
                        classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
-                            nodes[2]= classifiers[2]->node;
+                        nodes[2] = classifiers[2]->node;
                        classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
                        nodes[3] = classifiers[3]->node;
                        classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
@ -893,30 +919,74 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                        nodes[7] = classifiers[7]->node;

                        __m256 t = _mm256_set1_ps(variance_norm_factor);
-                            t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
+                        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
+                                                           nodes[6]->threshold,
+                                                           nodes[5]->threshold,
+                                                           nodes[4]->threshold,
+                                                           nodes[3]->threshold,
+                                                           nodes[2]->threshold,
+                                                           nodes[1]->threshold,
+                                                           nodes[0]->threshold));
+
+                        __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[6]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[5]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[4]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[3]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[2]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[1]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[0]->feature.rect[0], p_offset));
+
+                        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
+                                                      nodes[6]->feature.rect[0].weight,
+                                                      nodes[5]->feature.rect[0].weight,
+                                                      nodes[4]->feature.rect[0].weight,
+                                                      nodes[3]->feature.rect[0].weight,
+                                                      nodes[2]->feature.rect[0].weight,
+                                                      nodes[1]->feature.rect[0].weight,
+                                                      nodes[0]->feature.rect[0].weight);

-                            __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
-                                calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
-                                p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
-                            __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
-                                nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
                        __m256 sum = _mm256_mul_ps(offset, weight);

-                            offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
-                                calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
-                                calc_sum(nodes[0]->feature.rect[1],p_offset));
-                            weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
-                                nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
-                            sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
-
-                            __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
-                                classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
-                            __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
-                                classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
+                        offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[6]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[5]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[4]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[3]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[2]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[1]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[0]->feature.rect[1], p_offset));
+
+                        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
+                                               nodes[6]->feature.rect[1].weight,
+                                               nodes[5]->feature.rect[1].weight,
+                                               nodes[4]->feature.rect[1].weight,
+                                               nodes[3]->feature.rect[1].weight,
+                                               nodes[2]->feature.rect[1].weight,
+                                               nodes[1]->feature.rect[1].weight,
+                                               nodes[0]->feature.rect[1].weight);

-                            _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )));
-                            stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+                        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));

+                        __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
+                                                      classifiers[6]->alpha[0],
+                                                      classifiers[5]->alpha[0],
+                                                      classifiers[4]->alpha[0],
+                                                      classifiers[3]->alpha[0],
+                                                      classifiers[2]->alpha[0],
+                                                      classifiers[1]->alpha[0],
+                                                      classifiers[0]->alpha[0]);
+                        __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
+                                                      classifiers[6]->alpha[1],
+                                                      classifiers[5]->alpha[1],
+                                                      classifiers[4]->alpha[1],
+                                                      classifiers[3]->alpha[1],
+                                                      classifiers[2]->alpha[1],
+                                                      classifiers[1]->alpha[1],
+                                                      classifiers[0]->alpha[1]);
+
+                        _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
+                        stage_sum += (buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
                    }

                    for( ; j < cascade->stage_classifier[i].count; j++ )
@ -941,7 +1011,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                        classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
                        nodes[1] = classifiers[1]->node;
                        classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
-                            nodes[2]= classifiers[2]->node;
+                        nodes[2] = classifiers[2]->node;
                        classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
                        nodes[3] = classifiers[3]->node;
                        classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
@ -954,22 +1024,55 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                        nodes[7] = classifiers[7]->node;

                        __m256 t = _mm256_set1_ps(variance_norm_factor);
-                            t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));

-                            __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
-                                calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
-                                p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
-                            __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
-                                nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
+                        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
+                                                           nodes[6]->threshold,
+                                                           nodes[5]->threshold,
+                                                           nodes[4]->threshold,
+                                                           nodes[3]->threshold,
+                                                           nodes[2]->threshold,
+                                                           nodes[1]->threshold,
+                                                           nodes[0]->threshold));
+
+                        __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[6]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[5]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[4]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[3]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[2]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[1]->feature.rect[0], p_offset),
+                                                      calc_sum(nodes[0]->feature.rect[0], p_offset));
+
+                        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
+                                                      nodes[6]->feature.rect[0].weight,
+                                                      nodes[5]->feature.rect[0].weight,
+                                                      nodes[4]->feature.rect[0].weight,
+                                                      nodes[3]->feature.rect[0].weight,
+                                                      nodes[2]->feature.rect[0].weight,
+                                                      nodes[1]->feature.rect[0].weight,
+                                                      nodes[0]->feature.rect[0].weight);
+
                        __m256 sum = _mm256_mul_ps(offset, weight);

-                            offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
-                                calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
-                                calc_sum(nodes[0]->feature.rect[1],p_offset));
-                            weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
-                                nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
-
-                            sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+                        offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[6]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[5]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[4]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[3]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[2]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[1]->feature.rect[1], p_offset),
+                                               calc_sum(nodes[0]->feature.rect[1], p_offset));
+
+                        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
+                                               nodes[6]->feature.rect[1].weight,
+                                               nodes[5]->feature.rect[1].weight,
+                                               nodes[4]->feature.rect[1].weight,
+                                               nodes[3]->feature.rect[1].weight,
+                                               nodes[2]->feature.rect[1].weight,
+                                               nodes[1]->feature.rect[1].weight,
+                                               nodes[0]->feature.rect[1].weight);
+
+                        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

                        if( nodes[0]->feature.rect[2].p0 )
                            tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
@ -990,16 +1093,28 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,

                        sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));

-                            __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
-                                classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
-                            __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
-                                classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
+                        __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
+                                                      classifiers[6]->alpha[0],
+                                                      classifiers[5]->alpha[0],
+                                                      classifiers[4]->alpha[0],
+                                                      classifiers[3]->alpha[0],
+                                                      classifiers[2]->alpha[0],
+                                                      classifiers[1]->alpha[0],
+                                                      classifiers[0]->alpha[0]);
+                        __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
+                                                      classifiers[6]->alpha[1],
+                                                      classifiers[5]->alpha[1],
+                                                      classifiers[4]->alpha[1],
+                                                      classifiers[3]->alpha[1],
+                                                      classifiers[2]->alpha[1],
+                                                      classifiers[1]->alpha[1],
+                                                      classifiers[0]->alpha[1]);

                        __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
                        outBuf = _mm256_hadd_ps(outBuf, outBuf);
                        outBuf = _mm256_hadd_ps(outBuf, outBuf);
                        _mm256_store_ps(buf, outBuf);
-                            stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+                        stage_sum += (buf[0] + buf[4]);
                    }

                    for( ; j < cascade->stage_classifier[i].count; j++ )
@ -1020,8 +1135,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
            }
        }
        else
-    #endif
-    #if defined CV_HAAR_USE_SSE && CV_HAAR_USE_SSE && (!defined CV_HAAR_USE_AVX || !CV_HAAR_USE_AVX) //old SSE optimization
+#elif defined CV_HAAR_USE_SSE //old SSE optimization
        if(haveSSE2)
        {
            for( i = start_stage; i < cascade->count; i++ )
@ -1070,7 +1184,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
            }
        }
        else
-    #endif
+#endif // AVX or SSE
        {
            for( i = start_stage; i < cascade->count; i++ )
            {
@ -1106,24 +1220,24 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
            }
        }
    }
-
    else
    {
        for( i = start_stage; i < cascade->count; i++ )
        {
            stage_sum = 0.0;
            int k = 0;
-            #ifdef CV_HAAR_USE_AVX
+
+#ifdef CV_HAAR_USE_AVX
            if(haveAVX)
            {
-                for( ; k < cascade->stage_classifier[i].count-8; k+=8 )
+                for( ; k < cascade->stage_classifier[i].count - 8; k += 8 )
                {
                    stage_sum += icvEvalHidHaarClassifierAVX(
-                        cascade->stage_classifier[i].classifier+k,
+                        cascade->stage_classifier[i].classifier + k,
                        variance_norm_factor, p_offset );
                }
            }
-            #endif
+#endif
            for(; k < cascade->stage_classifier[i].count; k++ )
            {

@ -1136,7 +1250,6 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                return -i;
        }
    }
-    //_mm256_zeroupper();
    return 1;
 }

@ -1186,7 +1299,7 @@ struct HaarDetectObjects_ScaleImage_Invoker
        Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1);
        int x, y, ystep = factor > 2 ? 1 : 2;

-    #ifdef HAVE_IPP
+#ifdef HAVE_IPP
        if( cascade->hid_cascade->ipp_stages )
        {
            IppiRect iequRect = {equRect.x, equRect.y, equRect.width, equRect.height};
@ -1241,7 +1354,7 @@ struct HaarDetectObjects_ScaleImage_Invoker
                }
        }
        else
-#endif
+#endif // IPP
            for( y = y1; y < y2; y += ystep )
                for( x = 0; x < ssz.width; x += ystep )
                {
@ -2418,45 +2531,4 @@ CvType haar_type( CV_TYPE_NAME_HAAR, icvIsHaarClassifier,
                  icvReadHaarClassifier, icvWriteHaarClassifier,
                  icvCloneHaarClassifier );

-#if 0
-namespace cv
-{
-
-HaarClassifierCascade::HaarClassifierCascade() {}
-HaarClassifierCascade::HaarClassifierCascade(const String& filename)
-{ load(filename); }
-
-bool HaarClassifierCascade::load(const String& filename)
-{
-    cascade = Ptr<CvHaarClassifierCascade>((CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0));
-    return (CvHaarClassifierCascade*)cascade != 0;
-}
-
-void HaarClassifierCascade::detectMultiScale( const Mat& image,
-                       Vector<Rect>& objects, double scaleFactor,
-                       int minNeighbors, int flags,
-                       Size minSize )
-{
-    MemStorage storage(cvCreateMemStorage(0));
-    CvMat _image = image;
-    CvSeq* _objects = cvHaarDetectObjects( &_image, cascade, storage, scaleFactor,
-                                           minNeighbors, flags, minSize );
-    Seq<Rect>(_objects).copyTo(objects);
-}
-
-int HaarClassifierCascade::runAt(Point pt, int startStage, int) const
-{
-    return cvRunHaarClassifierCascade(cascade, pt, startStage);
-}
-
-void HaarClassifierCascade::setImages( const Mat& sum, const Mat& sqsum,
-                                       const Mat& tilted, double scale )
-{
-    CvMat _sum = sum, _sqsum = sqsum, _tilted = tilted;
-    cvSetImagesForHaarClassifierCascade( cascade, &_sum, &_sqsum, &_tilted, scale );
-}
-
-}
-#endif
-
 /* End of file. */