3rd attempt to prepare patch with improved OpenCL kernels of CascadeClassifier.

11 years ago · 30593ee55e
parent 0fef7f8b96
commit 30593ee55e
9 changed files with 1183 additions and 1007 deletions
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -615,7 +615,7 @@ static void* initOpenCLAndLoad(const char* funcname)
            initialized = true;
            g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
            if( g_haveOpenCL )
-                fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath);
+                fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
            else
                fprintf(stderr, "Failed to load OpenCL runtime\n");
        }
@ -1335,11 +1335,13 @@ inline bool operator < (const HashKey& h1, const HashKey& h2)
    return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
 }

-static bool g_isOpenCLInitialized = false;
-static bool g_isOpenCLAvailable = false;

 bool haveOpenCL()
 {
+#ifdef HAVE_OPENCL
+    static bool g_isOpenCLInitialized = false;
+    static bool g_isOpenCLAvailable = false;
+
    if (!g_isOpenCLInitialized)
    {
        try
@ -1354,6 +1356,9 @@ bool haveOpenCL()
        g_isOpenCLInitialized = true;
    }
    return g_isOpenCLAvailable;
+#else
+    return false;
+#endif
 }

 bool useOpenCL()
--- a/modules/objdetect/doc/cascade_classification.rst
+++ b/modules/objdetect/doc/cascade_classification.rst
@ -32,112 +32,6 @@ The following reference is for the detection part only. There is a separate appl
 .. [Lienhart02] Rainer Lienhart and Jochen Maydt. An Extended Set of Haar-like Features for Rapid Object Detection. IEEE ICIP 2002, Vol. 1, pp. 900-903, Sep. 2002. This paper, as well as the extended technical report, can be retrieved at http://www.multimedia-computing.de/mediawiki//images/5/52/MRL-TR-May02-revised-Dec02.pdf


-FeatureEvaluator
----------------
-.. ocv:class:: FeatureEvaluator
-
-Base class for computing feature values in cascade classifiers. ::
-
-    class CV_EXPORTS FeatureEvaluator
-    {
-    public:
-        enum { HAAR = 0, LBP = 1 }; // supported feature types
-        virtual ~FeatureEvaluator(); // destructor
-        virtual bool read(const FileNode& node);
-        virtual Ptr<FeatureEvaluator> clone() const;
-        virtual int getFeatureType() const;
-
-        virtual bool setImage(const Mat& img, Size origWinSize);
-        virtual bool setWindow(Point p);
-
-        virtual double calcOrd(int featureIdx) const;
-        virtual int calcCat(int featureIdx) const;
-
-        static Ptr<FeatureEvaluator> create(int type);
-    };
-
-
-FeatureEvaluator::read
--------------------------
-Reads parameters of features from the ``FileStorage`` node.
-
-.. ocv:function:: bool FeatureEvaluator::read(const FileNode& node)
-
-    :param node: File node from which the feature parameters are read.
-
-
-
-FeatureEvaluator::clone
---------------------------
-Returns a full copy of the feature evaluator.
-
-.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::clone() const
-
-
-
-FeatureEvaluator::getFeatureType
------------------------------------
-Returns the feature type (``HAAR`` or ``LBP`` for now).
-
-.. ocv:function:: int FeatureEvaluator::getFeatureType() const
-
-
-FeatureEvaluator::setImage
------------------------------
-Assigns an image to feature evaluator.
-
-.. ocv:function:: bool FeatureEvaluator::setImage(InputArray img, Size origWinSize, Size sumSize)
-
-    :param img: Matrix of the type ``CV_8UC1`` containing an image where the features are computed.
-
-    :param origWinSize: Size of training images.
-
-    :param sumSize: The requested size of integral images (so if the integral image is smaller, it resides in the top-left corner of the larger image of requested size). Because the features are represented using offsets from the image origin, using the same sumSize for all scales helps to avoid constant readjustments of the features to different scales.
-
-The method assigns an image, where the features will be computed, to the feature evaluator.
-
-
-
-FeatureEvaluator::setWindow
-------------------------------
-Assigns a window in the current image where the features will be computed.
-
-.. ocv:function:: bool FeatureEvaluator::setWindow(Point p)
-
-    :param p: Upper left point of the window where the features are computed. Size of the window is equal to the size of training images.
-
-FeatureEvaluator::calcOrd
-----------------------------
-Computes the value of an ordered (numerical) feature.
-
-.. ocv:function:: double FeatureEvaluator::calcOrd(int featureIdx) const
-
-    :param featureIdx: Index of the feature whose value is computed.
-
-The function returns the computed value of an ordered feature.
-
-
-
-FeatureEvaluator::calcCat
-----------------------------
-Computes the value of a categorical feature.
-
-.. ocv:function:: int FeatureEvaluator::calcCat(int featureIdx) const
-
-    :param featureIdx: Index of the feature whose value is computed.
-
-The function returns the computed label of a categorical feature, which is the value from [0,... (number of categories - 1)].
-
-
-FeatureEvaluator::create
----------------------------
-Constructs the feature evaluator.
-
-.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::create(int type)
-
-    :param type: Type of features evaluated by cascade (``HAAR`` or ``LBP`` for now).
-
-
 CascadeClassifier
 -----------------
 .. ocv:class:: CascadeClassifier
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@ -121,29 +121,6 @@ CV_EXPORTS   void groupRectangles_meanshift(std::vector<Rect>& rectList, std::ve
                                            std::vector<double>& foundScales,
                                            double detectThreshold = 0.0, Size winDetSize = Size(64, 128));

-class CV_EXPORTS FeatureEvaluator
-{
-public:
-    enum { HAAR = 0,
-           LBP  = 1,
-           HOG  = 2
-         };
-
-    virtual ~FeatureEvaluator();
-
-    virtual bool read(const FileNode& node);
-    virtual Ptr<FeatureEvaluator> clone() const;
-    virtual int getFeatureType() const;
-
-    virtual bool setImage(InputArray img, Size origWinSize, Size sumSize);
-    virtual bool setWindow(Point p);
-
-    virtual double calcOrd(int featureIdx) const;
-    virtual int calcCat(int featureIdx) const;
-
-    static Ptr<FeatureEvaluator> create(int type);
-};
-
 template<> CV_EXPORTS void DefaultDeleter<CvHaarClassifierCascade>::operator ()(CvHaarClassifierCascade* obj) const;

 enum { CASCADE_DO_CANNY_PRUNING    = 1,
--- a/modules/objdetect/perf/opencl/perf_cascades.cpp
+++ b/modules/objdetect/perf/opencl/perf_cascades.cpp
@ -24,14 +24,14 @@ OCL_PERF_TEST_P(Cascade_Image_MinSize, CascadeClassifier,
                                     string("cv/cascadeandhog/images/class57.png") ),
                    testing::Values(30, 64, 90) ) )
 {
-    const string cascasePath = get<0>(GetParam());
+    const string cascadePath = get<0>(GetParam());
    const string imagePath   = get<1>(GetParam());
    int min_size = get<2>(GetParam());
    Size minSize(min_size, min_size);

-    CascadeClassifier cc( getDataPath(cascasePath) );
+    CascadeClassifier cc( getDataPath(cascadePath) );
    if (cc.empty())
-        FAIL() << "Can't load cascade file: " << getDataPath(cascasePath);
+        FAIL() << "Can't load cascade file: " << getDataPath(cascadePath);

    Mat img = imread(getDataPath(imagePath), IMREAD_GRAYSCALE);
    if (img.empty())
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@ -3,6 +3,72 @@
 namespace cv
 {

+class FeatureEvaluator // base class for cascade feature evaluators (e.g. LBPEvaluator derives from it); declared without CV_EXPORTS, unlike the declaration this patch removes from objdetect.hpp
+{
+public:
+    enum // feature type ids, as returned by the getFeatureType() overrides
+    {
+        HAAR = 0,
+        LBP  = 1,
+        HOG  = 2 // id is kept although this patch removes the HOGEvaluator class from this header
+    };
+
+    struct ScaleData // per-scale record; entries are stored in the scaleData vector below
+    {
+        ScaleData() { scale = 0.f; layer_ofs = ystep = 0; } // szi is not assigned here; it is left to Size's own default construction
+        Size getWorkingSize(Size winSize) const // szi minus winSize in each dimension, clamped so it is never negative
+        {
+            return Size(std::max(szi.width - winSize.width, 0),
+                        std::max(szi.height - winSize.height, 0));
+        }
+
+        float scale;
+        Size szi; // NOTE(review): presumably the image size at this scale - confirm in updateScaleData()
+        int layer_ofs, ystep; // NOTE(review): same field names as ScaleData in cascadedetect.cl, which appears to be the device-side copy - confirm the layouts match
+    };
+
+    virtual ~FeatureEvaluator();
+    
+    virtual bool read(const FileNode& node, Size origWinSize); // origWinSize is now passed at read time instead of to setImage()
+    virtual Ptr<FeatureEvaluator> clone() const;
+    virtual int getFeatureType() const;
+    int getNumChannels() const { return nchannels; }
+
+    virtual bool setImage(InputArray img, const std::vector<float>& scales); // takes the full list of scales; replaces the removed setImage(img, origWinSize, sumSize)
+    virtual bool setWindow(Point p, int scaleIdx); // scaleIdx is new in this patch; the removed version took only the point
+    const ScaleData& getScaleData(int scaleIdx) const // returns the entry for scaleIdx
+    {
+        CV_Assert( 0 <= scaleIdx && scaleIdx < (int)scaleData->size()); // range check before indexing
+        return scaleData->at(scaleIdx);
+    }
+    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual void getMats();
+
+    Size getLocalSize() const { return localSize; }
+    Size getLocalBufSize() const { return lbufSize; }
+
+    virtual float calcOrd(int featureIdx) const; // returns float; the declaration removed from objdetect.hpp returned double
+    virtual int calcCat(int featureIdx) const;
+
+    static Ptr<FeatureEvaluator> create(int type); // NOTE(review): type is presumably one of HAAR/LBP/HOG above - confirm in the .cpp
+
+protected:
+    enum { SBUF_VALID=1, USBUF_VALID=2 }; // distinct bit values (1 and 2)
+    int sbufFlag; // NOTE(review): presumably a combination of the two flags above - confirm in the .cpp
+
+    bool updateScaleData( Size imgsz, const std::vector<float>& _scales );
+    virtual void computeChannels( int, InputArray ) {} // no-op by default; HaarEvaluator and LBPEvaluator declare overrides
+    virtual void computeOptFeatures() {} // no-op by default; HaarEvaluator and LBPEvaluator declare overrides
+
+    Size origWinSize, sbufSize, localSize, lbufSize; // localSize and lbufSize are exposed via getLocalSize() / getLocalBufSize()
+    int nchannels; // exposed via getNumChannels()
+    Mat sbuf, rbuf;
+    UMat urbuf, usbuf, ufbuf, uscaleData;
+
+    Ptr<std::vector<ScaleData> > scaleData; // read through getScaleData()
+};
+
+
 class CascadeClassifierImpl : public BaseCascadeClassifier
 {
 public:
@ -54,9 +120,8 @@ protected:
                            int yStep, double factor, std::vector<Rect>& candidates,
                            std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                            Size sumSize0, bool outputRejectLevels = false );
-    bool ocl_detectSingleScale( InputArray image, Size processingRectSize,
-                                int yStep, double factor, Size sumSize0 );
-
+    bool ocl_detectMultiScaleNoGrouping( const std::vector<float>& scales,
+                                         std::vector<Rect>& candidates );

    void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates,
                                    std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
@ -72,6 +137,7 @@ protected:
    };

    friend class CascadeClassifierInvoker;
+    friend class SparseCascadeClassifierInvoker;

    template<class FEval>
    friend int predictOrdered( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
@ -85,7 +151,7 @@ protected:
    template<class FEval>
    friend int predictCategoricalStump( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);

-    int runAt( Ptr<FeatureEvaluator>& feval, Point pt, double& weight );
+    int runAt( Ptr<FeatureEvaluator>& feval, Point pt, int scaleIdx, double& weight );

    class Data
    {
@ -126,12 +192,10 @@ protected:

        bool read(const FileNode &node);

-        bool isStumpBased() const { return maxNodesPerTree == 1; }
-
        int stageType;
        int featureType;
        int ncategories;
-        int maxNodesPerTree;
+        int minNodesPerTree, maxNodesPerTree;
        Size origWinSize;

        std::vector<Stage> stages;
@ -148,7 +212,7 @@ protected:

    Ptr<MaskGenerator> maskGenerator;
    UMat ugrayImage, uimageBuffer;
-    UMat ufacepos, ustages, ustumps, usubsets;
+    UMat ufacepos, ustages, unodes, uleaves, usubsets;
    ocl::Kernel haarKernel, lbpKernel;
    bool tryOpenCL;

@ -268,7 +332,6 @@ public:

        enum { RECT_NUM = Feature::RECT_NUM };
        float calc( const int* pwin ) const;
-
        void setOffsets( const Feature& _f, int step, int tofs );

        int ofs[RECT_NUM][4];
@ -278,35 +341,34 @@ public:
    HaarEvaluator();
    virtual ~HaarEvaluator();

-    virtual bool read( const FileNode& node );
+    virtual bool read( const FileNode& node, Size origWinSize);
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const { return FeatureEvaluator::HAAR; }

-    virtual bool setImage(InputArray, Size origWinSize, Size sumSize);
-    virtual bool setWindow(Point pt);
-    virtual Rect getNormRect() const;
-    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual bool setWindow(Point p, int scaleIdx);
+    Rect getNormRect() const;
+    int getSquaresOffset() const;

-    double operator()(int featureIdx) const
+    float operator()(int featureIdx) const
    { return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
-    virtual double calcOrd(int featureIdx) const
+    virtual float calcOrd(int featureIdx) const
    { return (*this)(featureIdx); }

 protected:
-    Size origWinSize, sumSize0;
+    virtual void computeChannels( int i, InputArray img );
+    virtual void computeOptFeatures();
+
    Ptr<std::vector<Feature> > features;
    Ptr<std::vector<OptFeature> > optfeatures;
-    OptFeature* optfeaturesPtr; // optimization
+    Ptr<std::vector<OptFeature> > optfeatures_lbuf;
    bool hasTiltedFeatures;

-    Mat sum0, sum, sqsum0, sqsum;
-    UMat usum0, usum, usqsum0, usqsum, ufbuf;
-
+    int tofs, sqofs;
+    Vec4i nofs;
    Rect normrect;
-    int nofs[4];
-
    const int* pwin;
-    double varianceNormFactor;
+    OptFeature* optfeaturesPtr; // optimization
+    float varianceNormFactor;
 };

 inline HaarEvaluator::Feature :: Feature()
@ -336,28 +398,6 @@ inline float HaarEvaluator::OptFeature :: calc( const int* ptr ) const
    return ret;
 }

-inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step, int tofs )
-{
-    weight[0] = _f.rect[0].weight;
-    weight[1] = _f.rect[1].weight;
-    weight[2] = _f.rect[2].weight;
-
-    Rect r2 = weight[2] > 0 ? _f.rect[2].r : Rect(0,0,0,0);
-    if (_f.tilted)
-    {
-        CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], tofs, _f.rect[0].r, step );
-        CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], tofs, _f.rect[1].r, step );
-        CV_TILTED_PTRS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], tofs, r2, step );
-    }
-    else
-    {
-        CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
-        CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
-        CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, r2, step );
-    }
-}
-
-
 //----------------------------------------------  LBPEvaluator -------------------------------------

 class LBPEvaluator : public FeatureEvaluator
@ -367,7 +407,7 @@ public:
    {
        Feature();
        Feature( int x, int y, int _block_w, int _block_h  ) :
-            rect(x, y, _block_w, _block_h) {}
+                 rect(x, y, _block_w, _block_h) {}

        bool read(const FileNode& node );

@ -386,27 +426,25 @@ public:
    LBPEvaluator();
    virtual ~LBPEvaluator();

-    virtual bool read( const FileNode& node );
+    virtual bool read( const FileNode& node, Size origWinSize );
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const { return FeatureEvaluator::LBP; }

-    virtual bool setImage(InputArray image, Size _origWinSize, Size);
-    virtual bool setWindow(Point pt);
-    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual bool setWindow(Point p, int scaleIdx);

    int operator()(int featureIdx) const
    { return optfeaturesPtr[featureIdx].calc(pwin); }
    virtual int calcCat(int featureIdx) const
    { return (*this)(featureIdx); }
 protected:
-    Size origWinSize, sumSize0;
+    virtual void computeChannels( int i, InputArray img );
+    virtual void computeOptFeatures();
+
    Ptr<std::vector<Feature> > features;
    Ptr<std::vector<OptFeature> > optfeatures;
+    Ptr<std::vector<OptFeature> > optfeatures_lbuf;
    OptFeature* optfeaturesPtr; // optimization

-    Mat sum0, sum;
-    UMat usum0, usum, ufbuf;
-
    const int* pwin;
 };

@ -436,98 +474,6 @@ inline int LBPEvaluator::OptFeature :: calc( const int* p ) const
           (CALC_SUM_OFS_( ofs[4], ofs[5], ofs[8], ofs[9], p ) >= cval ? 1 : 0);
 }

-inline void LBPEvaluator::OptFeature :: setOffsets( const Feature& _f, int step )
-{
-    Rect tr = _f.rect;
-    CV_SUM_OFS( ofs[0], ofs[1], ofs[4], ofs[5], 0, tr, step );
-    tr.x += 2*_f.rect.width;
-    CV_SUM_OFS( ofs[2], ofs[3], ofs[6], ofs[7], 0, tr, step );
-    tr.y += 2*_f.rect.height;
-    CV_SUM_OFS( ofs[10], ofs[11], ofs[14], ofs[15], 0, tr, step );
-    tr.x -= 2*_f.rect.width;
-    CV_SUM_OFS( ofs[8], ofs[9], ofs[12], ofs[13], 0, tr, step );
-}
-
-//---------------------------------------------- HOGEvaluator -------------------------------------------
-
-class HOGEvaluator : public FeatureEvaluator
-{
-public:
-    struct Feature
-    {
-        Feature();
-        float calc( int offset ) const;
-        void updatePtrs( const std::vector<Mat>& _hist, const Mat &_normSum );
-        bool read( const FileNode& node );
-
-        enum { CELL_NUM = 4, BIN_NUM = 9 };
-
-        Rect rect[CELL_NUM];
-        int featComponent; //component index from 0 to 35
-        const float* pF[4]; //for feature calculation
-        const float* pN[4]; //for normalization calculation
-    };
-    HOGEvaluator();
-    virtual ~HOGEvaluator();
-    virtual bool read( const FileNode& node );
-    virtual Ptr<FeatureEvaluator> clone() const;
-    virtual int getFeatureType() const { return FeatureEvaluator::HOG; }
-    virtual bool setImage( InputArray image, Size winSize, Size );
-    virtual bool setWindow( Point pt );
-    double operator()(int featureIdx) const
-    {
-        return featuresPtr[featureIdx].calc(offset);
-    }
-    virtual double calcOrd( int featureIdx ) const
-    {
-        return (*this)(featureIdx);
-    }
-
-private:
-    virtual void integralHistogram( const Mat& srcImage, std::vector<Mat> &histogram, Mat &norm, int nbins ) const;
-
-    Size origWinSize;
-    Ptr<std::vector<Feature> > features;
-    Feature* featuresPtr;
-    std::vector<Mat> hist;
-    Mat normSum;
-    int offset;
-};
-
-inline HOGEvaluator::Feature :: Feature()
-{
-    rect[0] = rect[1] = rect[2] = rect[3] = Rect();
-    pF[0] = pF[1] = pF[2] = pF[3] = 0;
-    pN[0] = pN[1] = pN[2] = pN[3] = 0;
-    featComponent = 0;
-}
-
-inline float HOGEvaluator::Feature :: calc( int _offset ) const
-{
-    float res = CALC_SUM(pF, _offset);
-    float normFactor = CALC_SUM(pN, _offset);
-    res = (res > 0.001f) ? (res / ( normFactor + 0.001f) ) : 0.f;
-    return res;
-}
-
-inline void HOGEvaluator::Feature :: updatePtrs( const std::vector<Mat> &_hist, const Mat &_normSum )
-{
-    int binIdx = featComponent % BIN_NUM;
-    int cellIdx = featComponent / BIN_NUM;
-    Rect normRect = Rect( rect[0].x, rect[0].y, 2*rect[0].width, 2*rect[0].height );
-
-    const float* featBuf = (const float*)_hist[binIdx].data;
-    size_t featStep = _hist[0].step / sizeof(featBuf[0]);
-
-    const float* normBuf = (const float*)_normSum.data;
-    size_t normStep = _normSum.step / sizeof(normBuf[0]);
-
-    CV_SUM_PTRS( pF[0], pF[1], pF[2], pF[3], featBuf, rect[cellIdx], featStep );
-    CV_SUM_PTRS( pN[0], pN[1], pN[2], pN[3], normBuf, normRect, normStep );
-}
-
-
-

 //----------------------------------------------  predictor functions -------------------------------------

@ -662,11 +608,7 @@ inline int predictCategoricalStump( CascadeClassifierImpl& cascade,
    const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0];
    const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];

-#ifdef HAVE_TEGRA_OPTIMIZATION
-    float tmp = 0; // float accumulator -- float operations are quicker
-#else
-    double tmp = 0;
-#endif
+    float tmp = 0;
    for( int si = 0; si < nstages; si++ )
    {
        const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si];
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@ -1,6 +1,18 @@
 ///////////////////////////// OpenCL kernels for face detection //////////////////////////////
 ////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////

+//
+// the code has been derived from the OpenCL Haar cascade kernel by
+//
+//    Niko Li, newlife20080214@gmail.com
+//    Wang Weiyan, wangweiyanster@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Nathan, liujun@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
+//    Erping Pang, erping@multicorewareinc.com
+//
+
+
 typedef struct __attribute__((aligned(4))) OptHaarFeature
 {
    int4 ofs[3] __attribute__((aligned (4)));
@ -20,6 +32,12 @@ typedef struct __attribute__((aligned(4))) Stump
 }
 Stump;

+typedef struct __attribute__((aligned(4))) Node // one internal node of a multi-node tree; used by the kernel's NODE_COUNT != 1 path
+{
+    int4 n __attribute__((aligned (4))); // x: index into optfeatures; y: threshold stored as float bits (read with as_float); z / w: next idx when the feature value is below / not below threshold*nf - the walk stops once idx <= 0 and leaves[-idx] is added
+}
+Node;
+
 typedef struct __attribute__((aligned (4))) Stage
 {
    int first __attribute__((aligned (4)));
@ -28,151 +46,614 @@ typedef struct __attribute__((aligned (4))) Stage
 }
 Stage;

-__kernel void runHaarClassifierStump(
+typedef struct __attribute__((aligned (4))) ScaleData // per-scale parameters; runHaarClassifier indexes an array of these by scaleIdx
+{
+    float scale __attribute__((aligned (4)));
+    int szi_width __attribute__((aligned (4))); // the kernel's work area is max(szi_width - windowsize.x, 0) wide
+    int szi_height __attribute__((aligned (4))); // the kernel's work area is max(szi_height - windowsize.y, 0) high
+    int layer_ofs __attribute__((aligned (4))); // element offset added to the sum pointer for this scale
+    int ystep __attribute__((aligned (4))); // when not 1, only positions whose local x and y are both even are kept as candidates
+}
+ScaleData;
+
+#ifndef SUM_BUF_SIZE
+#define SUM_BUF_SIZE 0 // default when not supplied by the build options: 0 compiles out the local-memory copy of the sum tile, so sums are read through the global pointer
+#endif
+
+#ifndef NODE_COUNT
+#define NODE_COUNT 1 // default when not supplied by the build options: 1 selects the stump path, where nodes is reinterpreted as an array of Stump
+#endif
+
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
+void runHaarClassifier(
+    int nscales, __global const ScaleData* scaleData,
    __global const int* sum,
-    int sumstep, int sumoffset,
-    __global const int* sqsum,
-    int sqsumstep, int sqsumoffset,
+    int _sumstep, int sumoffset,
    __global const OptHaarFeature* optfeatures,

-    int nstages,
+    int splitstage, int nstages,
    __global const Stage* stages,
-    __global const Stump* stumps,
+    __global const Node* nodes,
+    __global const float* leaves0,

    volatile __global int* facepos,
-    int2 imgsize, int xyscale, float factor,
-    int4 normrect, int2 windowsize, int maxFaces)
+    int4 normrect, int sqofs, int2 windowsize, int maxFaces)
 {
-    int ix = get_global_id(0)*xyscale;
-    int iy = get_global_id(1)*xyscale;
-    sumstep /= sizeof(int);
-    sqsumstep /= sizeof(int);
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int groupIdx = get_group_id(0);
+    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
+    int scaleIdx, tileIdx, stageIdx;
+    int sumstep = (int)(_sumstep/sizeof(int));
+    int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),
+                        mad24(normrect.y, sumstep, normrect.x + normrect.z),
+                        mad24(normrect.y + normrect.w, sumstep, normrect.x),
+                        mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
+    int normarea = normrect.z * normrect.w;
+    float invarea = 1.f/normarea;
+    int lidx = ly*LOCAL_SIZE_X + lx;

-    if( ix < imgsize.x && iy < imgsize.y )
+    #if SUM_BUF_SIZE > 0
+    int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),
+                       mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
+                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
+                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
+    #else
+    int4 nofs = nofs0;
+    #endif
+    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
+    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
+    #if SUM_BUF_SIZE > 0
+    __local int* ibuf = lstore;
+    __local int* lcount = ibuf + SUM_BUF_SIZE;
+    #else
+    __local int* lcount = lstore;
+    #endif
+    __local float* lnf = (__local float*)(lcount + 1);
+    __local float* lpartsum = lnf + LOCAL_SIZE;
+    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
+
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
    {
-        int stageIdx;
-        __global const Stump* stump = stumps;
-
-        __global const int* psum = sum + mad24(iy, sumstep, ix);
-        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
-        int normarea = normrect.z * normrect.w;
-        float invarea = 1.f/normarea;
-        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
-                      pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
-        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
-        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
-        nf = nf > 0 ? nf : 1.f;
-
-        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
+                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
+        int totalTiles = ntiles.x*ntiles.y;
+
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
        {
-            int i, ntrees = stages[stageIdx].ntrees;
-            float s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++ )
+            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
+            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
+            int ix = lx, iy = ly;
+            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
+            __global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
+
+            if( ix0 >= worksize.x || iy0 >= worksize.y )
+                continue;
+            #if SUM_BUF_SIZE > 0
+            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
            {
-                float4 st = stump->st;
-                __global const OptHaarFeature* f = optfeatures + as_int(st.x);
-                float4 weight = f->weight;
-
-                int4 ofs = f->ofs[0];
-                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
-                ofs = f->ofs[1];
-                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
-                if( weight.z > 0 )
+                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
+                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            #endif
+
+            if( lidx == 0 )
+                lcount[0] = 0;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
+            {
+                #if NODE_COUNT==1
+                __global const Stump* stump = (__global const Stump*)nodes;
+                #else
+                __global const Node* node = nodes;
+                __global const float* leaves = leaves0;
+                #endif
+                #if SUM_BUF_SIZE > 0
+                __local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
+                #else
+                __global const int* psum = psum1;
+                #endif
+
+                __global const float* psqsum = (__global const float*)(psum1 + sqofs);
+                float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
+                float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
+                float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
+                nf = nf > 0 ? nf : 1.f;
+
+                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
                {
-                    ofs = f->ofs[2];
-                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                    int ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    #if NODE_COUNT==1
+                    for( i = 0; i < ntrees; i++ )
+                    {
+                        float4 st = stump[i].st;
+                        __global const OptHaarFeature* f = optfeatures + as_int(st.x);
+                        float4 weight = f->weight;
+
+                        int4 ofs = f->ofs[0];
+                        sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                        ofs = f->ofs[1];
+                        sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                        if( weight.z > 0 )
+                        {
+                            ofs = f->ofs[2];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                        }
+
+                        s += (sval < st.y*nf) ? st.z : st.w;
+                    }
+                    stump += ntrees;
+                    #else
+                    for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
+                    {
+                        int idx = 0;
+                        do
+                        {
+                            int4 n = node[idx].n;
+                            __global const OptHaarFeature* f = optfeatures + n.x;
+                            float4 weight = f->weight;
+
+                            int4 ofs = f->ofs[0];
+
+                            sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                            ofs = f->ofs[1];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            if( weight.z > 0 )
+                            {
+                                ofs = f->ofs[2];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                            }
+
+                            idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
+                        }
+                        while(idx > 0);
+                        s += leaves[-idx];
+                    }
+                    #endif
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
                }

-                s += (sval < st.y*nf) ? st.z : st.w;
+                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
+                {
+                    int count = atomic_inc(lcount);
+                    lbuf[count] = (int)(ix | (iy << 8));
+                    lnf[count] = nf;
+                }
            }

-            if( s < stages[stageIdx].threshold )
-                break;
-        }
+            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
+            {
+                int nrects = lcount[0];

-        if( stageIdx == nstages )
-        {
-            int nfaces = atomic_inc(facepos);
-            if( nfaces < maxFaces )
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if( nrects == 0 )
+                    break;
+                if( lidx == 0 )
+                    lcount[0] = 0;
+
+                {
+                    #if NODE_COUNT == 1
+                    __global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
+                    #else
+                    __global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
+                    __global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
+                    #endif
+                    int nparts = LOCAL_SIZE / nrects;
+                    int ntrees = stages[stageIdx].ntrees;
+                    int ntrees_p = (ntrees + nparts - 1)/nparts;
+                    int nr = lidx / nparts;
+                    int partidx = -1, idxval = 0;
+                    float partsum = 0.f, nf = 0.f;
+
+                    if( nr < nrects )
+                    {
+                        partidx = lidx % nparts;
+                        idxval = lbuf[nr];
+                        nf = lnf[nr];
+
+                        {
+                        int ntrees0 = ntrees_p*partidx;
+                        int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
+                        int ix1 = idxval & 255, iy1 = idxval >> 8;
+                        #if SUM_BUF_SIZE > 0
+                        __local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
+                        #else
+                        __global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
+                        #endif
+
+                        #if NODE_COUNT == 1
+                        for( i = ntrees0; i < ntrees1; i++ )
+                        {
+                            float4 st = stump[i].st;
+                            __global const OptHaarFeature* f = optfeatures + as_int(st.x);
+                            float4 weight = f->weight;
+
+                            int4 ofs = f->ofs[0];
+                            float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                            ofs = f->ofs[1];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            //if( weight.z > 0 )
+                            {
+                                ofs = f->ofs[2];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                            }
+
+                            partsum += (sval < st.y*nf) ? st.z : st.w;
+                        }
+                        #else
+                        for( i = ntrees0; i < ntrees1; i++ )
+                        {
+                            int idx = 0;
+                            do
+                            {
+                                int4 n = node[i*2 + idx].n;
+                                __global const OptHaarFeature* f = optfeatures + n.x;
+                                float4 weight = f->weight;
+                                int4 ofs = f->ofs[0];
+
+                                float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                                ofs = f->ofs[1];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                                if( weight.z > 0 )
+                                {
+                                    ofs = f->ofs[2];
+                                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                                }
+
+                                idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
+                            }
+                            while(idx > 0);
+                            partsum += leaves[i*3-idx];
+                        }
+                        #endif
+                        }
+                    }
+                    lpartsum[lidx] = partsum;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+
+                    if( partidx == 0 )
+                    {
+                        float s = lpartsum[nr*nparts];
+                        for( i = 1; i < nparts; i++ )
+                            s += lpartsum[i + nr*nparts];
+                        if( s >= stages[stageIdx].threshold )
+                        {
+                            int count = atomic_inc(lcount);
+                            lbuf[count] = idxval;
+                            lnf[count] = nf;
+                        }
+                    }
+                }
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( stageIdx == nstages )
            {
-                volatile __global int* face = facepos + 1 + nfaces*4;
-                face[0] = convert_int_rte(ix*factor);
-                face[1] = convert_int_rte(iy*factor);
-                face[2] = convert_int_rte(windowsize.x*factor);
-                face[3] = convert_int_rte(windowsize.y*factor);
+                int nrects = lcount[0];
+                if( lidx < nrects )
+                {
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        int val = lbuf[lidx];
+                        face[0] = scaleIdx;
+                        face[1] = ix0 + (val & 255);
+                        face[2] = iy0 + (val >> 8);
+                    }
+                }
            }
        }
    }
 }

+// CALC_SUM_OFS_(p0, p1, p2, p3, ptr) evaluates ptr[p0] - ptr[p1] - ptr[p2] + ptr[p3].
+// Any earlier definition is dropped first; the LBP kernels below use this
+// file-scope version with corner offsets taken from OptLBPFeature::ofs.
+#undef CALC_SUM_OFS_
+#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])

-__kernel void runLBPClassifierStump(
+// LBP stump cascade evaluated with one window position per work-item.
+//
+// Scales are visited from nscales-1 down to 0. For every scale the work area
+// (szi_width - windowsize.x) x (szi_height - windowsize.y), clamped at 0, is
+// covered by tiles of local_size_x x local_size_y positions sampled every
+// ystep pixels; tiles are distributed over the work-groups with a stride of
+// ngroups. Each in-range work-item runs all stages sequentially and stops at
+// the first stage whose stump sum falls below the stage threshold.
+//
+// Parameters:
+//   nscales, scaleData - number of scales and the per-scale records; the
+//                        fields read here are ystep, szi_width, szi_height
+//                        and layer_ofs.
+//   sum                - buffer the features are evaluated on (presumably the
+//                        integral image with one layer per scale - confirm
+//                        against the host code).
+//   _sumstep           - row step of sum; it is divided by sizeof(int) before
+//                        use, i.e. presumably given in bytes.
+//   sumoffset          - not read by this kernel.
+//   optfeatures        - feature table, indexed by as_int(st.x) of a stump.
+//   splitstage         - not read by this kernel.
+//   nstages, stages    - number of stages and their ntrees/threshold records.
+//   stumps             - stumps of all stages, consumed sequentially.
+//   bitsets, bitsetSize- bitsetSize ints of category bits per stump.
+//   facepos            - facepos[0] is an atomic hit counter; hit k is stored
+//                        as (scaleIdx, ix, iy) at facepos[1 + 3*k].
+//   windowsize         - detection window size used to clamp the work area.
+//   maxFaces           - hits are only stored while the counter value is
+//                        below maxFaces (the counter itself keeps growing).
+__kernel void runLBPClassifierStumpSimple(
+    int nscales, __global const ScaleData* scaleData,
    __global const int* sum,
-    int sumstep, int sumoffset,
+    int _sumstep, int sumoffset,
    __global const OptLBPFeature* optfeatures,

-    int nstages,
+    int splitstage, int nstages,
    __global const Stage* stages,
    __global const Stump* stumps,
    __global const int* bitsets,
    int bitsetSize,

    volatile __global int* facepos,
-    int2 imgsize, int xyscale, float factor,
    int2 windowsize, int maxFaces)
 {
-    int ix = get_global_id(0)*xyscale;
-    int iy = get_global_id(1)*xyscale;
-    sumstep /= sizeof(int);
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int local_size_x = get_local_size(0);
+    int local_size_y = get_local_size(1);
+    // Flatten the 2D grid of work-groups into a linear group index.
+    int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
+    int ngroups = get_num_groups(0)*get_num_groups(1);
+    int scaleIdx, tileIdx, stageIdx;
+    // NOTE(review): startStage is never read; endStage is just nstages.
+    int startStage = 0, endStage = nstages;
+    // Row step of sum expressed in int elements.
+    int sumstep = (int)(_sumstep/sizeof(int));

-    if( ix < imgsize.x && iy < imgsize.y )
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
    {
-        int stageIdx;
-        __global const Stump* stump = stumps;
-        __global const int* p = sum + mad24(iy, sumstep, ix);
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        // Range of valid top-left window positions for this scale.
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        // Number of tiles needed to cover the positions sampled every ystep pixels.
+        int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
+                             (worksize.y/ystep + local_size_y-1)/local_size_y);
+        int totalTiles = ntiles.x*ntiles.y;

-        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
        {
-            int i, ntrees = stages[stageIdx].ntrees;
-            float s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++, bitsets += bitsetSize )
+            int iy = ((tileIdx / ntiles.x)*local_size_y + ly)*ystep;
+            int ix = ((tileIdx % ntiles.x)*local_size_x + lx)*ystep;
+
+            if( ix < worksize.x && iy < worksize.y )
            {
-                float4 st = stump->st;
-                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
-                int16 ofs = f->ofs;
+                // p addresses the window's top-left element inside this scale's layer.
+                __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
+                __global const Stump* stump = stumps;
+                __global const int* bitset = bitsets;

-                #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
-                ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
+                // stump and bitset advance across stage boundaries, so the stages
+                // must be evaluated strictly in order starting from stage 0.
+                for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
+                {
+                    int i, ntrees = stages[stageIdx].ntrees;
+                    // Stage sum; shadows the ScaleData pointer s of the enclosing scope.
+                    float s = 0.f;
+                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+                    {
+                        float4 st = stump->st;
+                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                        int16 ofs = f->ofs;

-                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+                        // Centre cell of the 3x3 grid described by the 16 corner offsets.
+                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );

-                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
-                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
-                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+                        // The 8 neighbour comparisons form an 8-bit code: the upper 3 bits
+                        // (idx) pick the bitset word, the lower 5 bits (mask) pick the bit.
+                        // The trailing numbers are the row-major cell indices 0..8.
+                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2

-                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
-                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
-                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
-                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
-                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
+                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 3

-                s += (bitsets[idx] & (1 << mask)) ? st.z : st.w;
-            }
+                        // st.z is added when the code's bit is set in the stump's bitset, st.w otherwise.
+                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
+                    }
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
+                }

-            if( s < stages[stageIdx].threshold )
-                break;
+                // Reached only without a break above, i.e. every stage was passed.
+                if( stageIdx == nstages )
+                {
+                    // Reserve an output slot; slots at or beyond maxFaces are dropped.
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        face[0] = scaleIdx;
+                        face[1] = ix;
+                        face[2] = iy;
+                    }
+                }
+            }
        }
+    }
+}
+
+// LBP stump cascade evaluated cooperatively by a LOCAL_SIZE_X x LOCAL_SIZE_Y
+// work-group.
+//
+// For every scale (last scale first) each work-group walks tiles of
+// LOCAL_SIZE_X x LOCAL_SIZE_Y window positions, tile index strided by the
+// number of groups:
+//   1. stages [0, splitstage): one work-item per position. Survivors are
+//      appended to the local list lbuf as tile-relative (ix | iy << 8).
+//   2. stages [splitstage, nstages): the survivors are shared by the whole
+//      group. Each survivor gets nparts = LOCAL_SIZE/nrects work-items which
+//      sum disjoint slices of the stage's stumps into lpartsum; the first of
+//      them adds the slices up and re-appends the position if the stage
+//      threshold is reached.
+//   3. positions that passed every stage are stored in facepos as
+//      (scaleIdx, x, y) triples following the hit counter facepos[0].
+//
+// Parameters:
+//   nscales, scaleData - number of scales and the per-scale records (ystep,
+//                        szi_width, szi_height and layer_ofs are read).
+//   sum                - buffer the features are evaluated on (presumably the
+//                        integral image with one layer per scale - confirm
+//                        against the host code).
+//   _sumstep           - row step of sum; divided by sizeof(int) before use,
+//                        i.e. presumably given in bytes.
+//   sumoffset          - not read by this kernel.
+//   optfeatures        - feature table, indexed by as_int(st.x) of a stump.
+//   splitstage         - first stage that is evaluated cooperatively.
+//   nstages, stages    - number of stages and their first/ntrees/threshold.
+//   stumps             - stumps of all stages.
+//   bitsets, bitsetSize- bitsetSize ints of category bits per stump.
+//   facepos            - facepos[0] is an atomic hit counter; hit k lives at
+//                        facepos[1 + 3*k].
+//   windowsize         - detection window size used to clamp the work area.
+//   maxFaces           - hits are only stored while the counter value is
+//                        below maxFaces.
+//
+// Every barrier below is reached under conditions that are identical for all
+// work-items of a group (tile coordinates, a count read between two
+// barriers), which barrier() requires.
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
+void runLBPClassifierStump(
+    int nscales, __global const ScaleData* scaleData,
+    __global const int* sum,
+    int _sumstep, int sumoffset,
+    __global const OptLBPFeature* optfeatures,
+
+    int splitstage, int nstages,
+    __global const Stage* stages,
+    __global const Stump* stumps,
+    __global const int* bitsets,
+    int bitsetSize,
+
+    volatile __global int* facepos,
+    int2 windowsize, int maxFaces)
+{
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int groupIdx = get_group_id(0);
+    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
+    int scaleIdx, tileIdx, stageIdx;
+    // Row step of sum expressed in int elements.
+    int sumstep = (int)(_sumstep/sizeof(int));
+    // Linear index of this work-item inside its group.
+    int lidx = ly*LOCAL_SIZE_X + lx;

-        if( stageIdx == nstages )
+    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
+    // Local memory layout, in ints:
+    //   [ibuf: SUM_BUF_SIZE][lcount: 1][lpartsum: LOCAL_SIZE floats]
+    //   [lbuf: LOCAL_SIZE shorts = LOCAL_SIZE/2 ints]
+    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
+    #if SUM_BUF_SIZE > 0
+    __local int* ibuf = lstore;
+    __local int* lcount = ibuf + SUM_BUF_SIZE;
+    #else
+    __local int* lcount = lstore;
+    #endif
+    __local float* lpartsum = (__local float*)(lcount + 1);
+    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
+
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
+    {
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        // Range of valid top-left window positions for this scale.
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
+                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
+        int totalTiles = ntiles.x*ntiles.y;
+
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
        {
-            int nfaces = atomic_inc(facepos);
-            if( nfaces < maxFaces )
+            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
+            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
+            int ix = lx, iy = ly;
+            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
+
+            // Depends on the tile only, so the whole group skips together.
+            if( ix0 >= worksize.x || iy0 >= worksize.y )
+                continue;
+            #if SUM_BUF_SIZE > 0
+            // Stage the tile's part of sum in local memory, four ints per step.
+            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
            {
-                volatile __global int* face = facepos + 1 + nfaces*4;
-                face[0] = convert_int_rte(ix*factor);
-                face[1] = convert_int_rte(iy*factor);
-                face[2] = convert_int_rte(windowsize.x*factor);
-                face[3] = convert_int_rte(windowsize.y*factor);
+                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
+                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
+            }
+            #endif
+            // Executed for every SUM_BUF_SIZE: work-items that are still reading
+            // lcount/lbuf in the output phase of the previous tile must finish
+            // before the counter is cleared below.
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if( lidx == 0 )
+                lcount[0] = 0;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
+            {
+                __global const Stump* stump = stumps;
+                __global const int* bitset = bitsets;
+                #if SUM_BUF_SIZE > 0
+                __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
+                #else
+                __global const int* p = psum0 + mad24(iy, sumstep, ix);
+                #endif
+
+                // Phase 1: one work-item per window position.
+                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
+                {
+                    int ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+                    {
+                        float4 st = stump->st;
+                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                        int16 ofs = f->ofs;
+
+                        // Centre cell of the 3x3 grid described by the 16 corner offsets.
+                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+
+                        // The upper 3 bits of the LBP code (idx) pick the bitset word, the
+                        // lower 5 bits (mask) pick the bit. Trailing numbers are cell indices.
+                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+
+                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 3
+
+                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
+                    }
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
+                }
+
+                // Keep the survivors; when ystep != 1 only positions whose
+                // tile-relative x and y are both even are kept.
+                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
+                {
+                    int count = atomic_inc(lcount);
+                    // Packed into a short: needs ix < 256 and iy < 128.
+                    lbuf[count] = (short)(ix | (iy << 8));
+                }
+            }
+
+            // Phase 2: the whole group works on the surviving positions.
+            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
+            {
+                // All appends to lbuf/lcount (from phase 1 or from the previous
+                // stage) must be complete before the count is read; otherwise
+                // work-items could see different counts and take the break
+                // below non-uniformly.
+                barrier(CLK_LOCAL_MEM_FENCE);
+                int nrects = lcount[0];
+
+                if( nrects == 0 )
+                    break;
+                // Everybody has to read the count before it is cleared.
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if( lidx == 0 )
+                    lcount[0] = 0;
+
+                {
+                    __global const Stump* stump = stumps + stages[stageIdx].first;
+                    __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
+                    // nparts work-items share one position, each taking ntrees_p stumps.
+                    int nparts = LOCAL_SIZE / nrects;
+                    int ntrees = stages[stageIdx].ntrees;
+                    int ntrees_p = (ntrees + nparts - 1)/nparts;
+                    int nr = lidx / nparts;
+                    int partidx = -1, idxval = 0;
+                    float partsum = 0.f;
+
+                    if( nr < nrects )
+                    {
+                        partidx = lidx % nparts;
+                        idxval = lbuf[nr];
+
+                        {
+                            int ntrees0 = ntrees_p*partidx;
+                            int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
+                            int ix1 = idxval & 255, iy1 = idxval >> 8;
+                            #if SUM_BUF_SIZE > 0
+                            __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
+                            #else
+                            __global const int* p = psum0 + mad24(iy1, sumstep, ix1);
+                            #endif
+
+                            for( i = ntrees0; i < ntrees1; i++ )
+                            {
+                                float4 st = stump[i].st;
+                                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                                int16 ofs = f->ofs;
+
+                                // CALC_SUM_OFS_ is the file-scope macro; it is not redefined here.
+                                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+
+                                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+
+                                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 3
+
+                                partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
+                            }
+                        }
+                    }
+                    lpartsum[lidx] = partsum;
+                    // Publishes the partial sums and orders the counter reset and
+                    // the lbuf reads above before the appends below.
+                    barrier(CLK_LOCAL_MEM_FENCE);
+
+                    // The first work-item of every position adds up its nparts slices.
+                    if( partidx == 0 )
+                    {
+                        float s = lpartsum[nr*nparts];
+                        for( i = 1; i < nparts; i++ )
+                            s += lpartsum[i + nr*nparts];
+                        if( s >= stages[stageIdx].threshold )
+                        {
+                            int count = atomic_inc(lcount);
+                            lbuf[count] = idxval;
+                        }
+                    }
+                }
+            }
+
+            // Phase 3: make the final survivor list visible, then emit it.
+            barrier(CLK_LOCAL_MEM_FENCE);
+            // False when the loop above was left through the break (no survivors).
+            if( stageIdx == nstages )
+            {
+                int nrects = lcount[0];
+                if( lidx < nrects )
+                {
+                    // Reserve an output slot; slots at or beyond maxFaces are dropped.
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        int val = lbuf[lidx];
+                        face[0] = scaleIdx;
+                        face[1] = ix0 + (val & 255);
+                        face[2] = iy0 + (val >> 8);
+                    }
+                }
            }
        }
    }
--- a/modules/objdetect/test/test_cascadeandhog.cpp
+++ b/modules/objdetect/test/test_cascadeandhog.cpp
@ -257,6 +257,7 @@ int CV_DetectorTest::runTestCase( int detectorIdx, vector<vector<Rect> >& object
    string dataPath = ts->get_data_path(), detectorFilename;
    if( !detectorFilenames[detectorIdx].empty() )
        detectorFilename = dataPath + detectorFilenames[detectorIdx];
+    printf("detector %s\n", detectorFilename.c_str());

    for( int ii = 0; ii < (int)imageFilenames.size(); ++ii )
    {
--- a/samples/cpp/ufacedetect.cpp
+++ b/samples/cpp/ufacedetect.cpp
@ -231,9 +231,14 @@ void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
    smallImg.copyTo(canvas);

    double fps = getTickFrequency()/t;
+    static double avgfps = 0;
+    static int nframes = 0;
+    nframes++;
+    double alpha = nframes > 50 ? 0.01 : 1./nframes;
+    avgfps = avgfps*(1-alpha) + fps*alpha;

-    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50),
-            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
+    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", avgfps), Point(50, 30),
+            FONT_HERSHEY_SIMPLEX, 0.8, Scalar(0,255,0), 2);

    for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
    {