3rd attempt to prepare patch with improved OpenCL kernels of CascadeClassifier.

pull/2265/head
Vadim Pisarevsky 11 years ago
parent 0fef7f8b96
commit 30593ee55e
  1. 11
      modules/core/src/ocl.cpp
  2. 106
      modules/objdetect/doc/cascade_classification.rst
  3. 23
      modules/objdetect/include/opencv2/objdetect.hpp
  4. 6
      modules/objdetect/perf/opencl/perf_cascades.cpp
  5. 1117
      modules/objdetect/src/cascadedetect.cpp
  6. 246
      modules/objdetect/src/cascadedetect.hpp
  7. 671
      modules/objdetect/src/opencl/cascadedetect.cl
  8. 1
      modules/objdetect/test/test_cascadeandhog.cpp
  9. 9
      samples/cpp/ufacedetect.cpp

@ -615,7 +615,7 @@ static void* initOpenCLAndLoad(const char* funcname)
initialized = true; initialized = true;
g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0; g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
if( g_haveOpenCL ) if( g_haveOpenCL )
fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath); fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
else else
fprintf(stderr, "Failed to load OpenCL runtime\n"); fprintf(stderr, "Failed to load OpenCL runtime\n");
} }
@ -1335,11 +1335,13 @@ inline bool operator < (const HashKey& h1, const HashKey& h2)
return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b); return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
} }
static bool g_isOpenCLInitialized = false;
static bool g_isOpenCLAvailable = false;
bool haveOpenCL() bool haveOpenCL()
{ {
#ifdef HAVE_OPENCL
static bool g_isOpenCLInitialized = false;
static bool g_isOpenCLAvailable = false;
if (!g_isOpenCLInitialized) if (!g_isOpenCLInitialized)
{ {
try try
@ -1354,6 +1356,9 @@ bool haveOpenCL()
g_isOpenCLInitialized = true; g_isOpenCLInitialized = true;
} }
return g_isOpenCLAvailable; return g_isOpenCLAvailable;
#else
return false;
#endif
} }
bool useOpenCL() bool useOpenCL()

@ -32,112 +32,6 @@ The following reference is for the detection part only. There is a separate appl
.. [Lienhart02] Rainer Lienhart and Jochen Maydt. An Extended Set of Haar-like Features for Rapid Object Detection. IEEE ICIP 2002, Vol. 1, pp. 900-903, Sep. 2002. This paper, as well as the extended technical report, can be retrieved at http://www.multimedia-computing.de/mediawiki//images/5/52/MRL-TR-May02-revised-Dec02.pdf .. [Lienhart02] Rainer Lienhart and Jochen Maydt. An Extended Set of Haar-like Features for Rapid Object Detection. IEEE ICIP 2002, Vol. 1, pp. 900-903, Sep. 2002. This paper, as well as the extended technical report, can be retrieved at http://www.multimedia-computing.de/mediawiki//images/5/52/MRL-TR-May02-revised-Dec02.pdf
FeatureEvaluator
----------------
.. ocv:class:: FeatureEvaluator
Base class for computing feature values in cascade classifiers. ::
class CV_EXPORTS FeatureEvaluator
{
public:
enum { HAAR = 0, LBP = 1 }; // supported feature types
virtual ~FeatureEvaluator(); // destructor
virtual bool read(const FileNode& node);
virtual Ptr<FeatureEvaluator> clone() const;
virtual int getFeatureType() const;
virtual bool setImage(InputArray img, Size origWinSize, Size sumSize);
virtual bool setWindow(Point p);
virtual double calcOrd(int featureIdx) const;
virtual int calcCat(int featureIdx) const;
static Ptr<FeatureEvaluator> create(int type);
};
FeatureEvaluator::read
--------------------------
Reads parameters of features from the ``FileStorage`` node.
.. ocv:function:: bool FeatureEvaluator::read(const FileNode& node)
:param node: File node from which the feature parameters are read.
FeatureEvaluator::clone
---------------------------
Returns a full copy of the feature evaluator.
.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::clone() const
FeatureEvaluator::getFeatureType
------------------------------------
Returns the feature type (``HAAR`` or ``LBP`` for now).
.. ocv:function:: int FeatureEvaluator::getFeatureType() const
FeatureEvaluator::setImage
------------------------------
Assigns an image to the feature evaluator.
.. ocv:function:: bool FeatureEvaluator::setImage(InputArray img, Size origWinSize, Size sumSize)
:param img: Matrix of the type ``CV_8UC1`` containing an image where the features are computed.
:param origWinSize: Size of training images.
:param sumSize: The requested size of integral images (so if the integral image is smaller, it resides in the top-left corner of the larger image of requested size). Because the features are represented using offsets from the image origin, using the same sumSize for all scales helps to avoid constant readjustments of the features to different scales.
The method assigns an image, where the features will be computed, to the feature evaluator.
FeatureEvaluator::setWindow
-------------------------------
Assigns a window in the current image where the features will be computed.
.. ocv:function:: bool FeatureEvaluator::setWindow(Point p)
:param p: Upper left point of the window where the features are computed. Size of the window is equal to the size of training images.
FeatureEvaluator::calcOrd
-----------------------------
Computes the value of an ordered (numerical) feature.
.. ocv:function:: double FeatureEvaluator::calcOrd(int featureIdx) const
:param featureIdx: Index of the feature whose value is computed.
The function returns the computed value of an ordered feature.
FeatureEvaluator::calcCat
-----------------------------
Computes the value of a categorical feature.
.. ocv:function:: int FeatureEvaluator::calcCat(int featureIdx) const
:param featureIdx: Index of the feature whose value is computed.
The function returns the computed label of a categorical feature, which is a value in the range [0, ..., (number of categories - 1)].
FeatureEvaluator::create
----------------------------
Constructs the feature evaluator.
.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::create(int type)
:param type: Type of features evaluated by cascade (``HAAR`` or ``LBP`` for now).
CascadeClassifier CascadeClassifier
----------------- -----------------
.. ocv:class:: CascadeClassifier .. ocv:class:: CascadeClassifier

@ -121,29 +121,6 @@ CV_EXPORTS void groupRectangles_meanshift(std::vector<Rect>& rectList, std::ve
std::vector<double>& foundScales, std::vector<double>& foundScales,
double detectThreshold = 0.0, Size winDetSize = Size(64, 128)); double detectThreshold = 0.0, Size winDetSize = Size(64, 128));
// Base class for computing feature values in cascade classifiers.
class CV_EXPORTS FeatureEvaluator
{
public:
// Identifiers of the feature types; returned by getFeatureType() and accepted by create().
enum { HAAR = 0,
LBP = 1,
HOG = 2
};
virtual ~FeatureEvaluator();
// Reads the parameters of the features from a FileStorage node.
virtual bool read(const FileNode& node);
// Returns a full copy of the feature evaluator.
virtual Ptr<FeatureEvaluator> clone() const;
// Returns the feature type (one of the enum values above).
virtual int getFeatureType() const;
// Assigns the image on which the features will be computed.
// origWinSize is the size of the training images; sumSize is the requested
// size of the integral images.
virtual bool setImage(InputArray img, Size origWinSize, Size sumSize);
// Assigns the window (given by its upper-left corner p) in the current image
// where the features will be computed.
virtual bool setWindow(Point p);
// Computes the value of an ordered (numerical) feature with index featureIdx.
virtual double calcOrd(int featureIdx) const;
// Computes the label of a categorical feature with index featureIdx.
virtual int calcCat(int featureIdx) const;
// Constructs a feature evaluator for the given feature type.
static Ptr<FeatureEvaluator> create(int type);
};
template<> CV_EXPORTS void DefaultDeleter<CvHaarClassifierCascade>::operator ()(CvHaarClassifierCascade* obj) const; template<> CV_EXPORTS void DefaultDeleter<CvHaarClassifierCascade>::operator ()(CvHaarClassifierCascade* obj) const;
enum { CASCADE_DO_CANNY_PRUNING = 1, enum { CASCADE_DO_CANNY_PRUNING = 1,

@ -24,14 +24,14 @@ OCL_PERF_TEST_P(Cascade_Image_MinSize, CascadeClassifier,
string("cv/cascadeandhog/images/class57.png") ), string("cv/cascadeandhog/images/class57.png") ),
testing::Values(30, 64, 90) ) ) testing::Values(30, 64, 90) ) )
{ {
const string cascasePath = get<0>(GetParam()); const string cascadePath = get<0>(GetParam());
const string imagePath = get<1>(GetParam()); const string imagePath = get<1>(GetParam());
int min_size = get<2>(GetParam()); int min_size = get<2>(GetParam());
Size minSize(min_size, min_size); Size minSize(min_size, min_size);
CascadeClassifier cc( getDataPath(cascasePath) ); CascadeClassifier cc( getDataPath(cascadePath) );
if (cc.empty()) if (cc.empty())
FAIL() << "Can't load cascade file: " << getDataPath(cascasePath); FAIL() << "Can't load cascade file: " << getDataPath(cascadePath);
Mat img = imread(getDataPath(imagePath), IMREAD_GRAYSCALE); Mat img = imread(getDataPath(imagePath), IMREAD_GRAYSCALE);
if (img.empty()) if (img.empty())

File diff suppressed because it is too large Load Diff

@ -3,6 +3,72 @@
namespace cv namespace cv
{ {
// Internal base class of the feature evaluators used by the cascade classifier.
// It keeps per-scale bookkeeping (ScaleData) together with the CPU (Mat) and
// OpenCL (UMat) buffers shared by the concrete evaluators.
class FeatureEvaluator
{
public:
    // Feature type identifiers (see getFeatureType() / create()).
    enum
    {
        HAAR = 0,
        LBP  = 1,
        HOG  = 2
    };

    // Description of one processed scale.
    struct ScaleData
    {
        // szi is left to its own default constructor.
        ScaleData() : scale(0.f), layer_ofs(0), ystep(0) {}

        // Returns szi shrunk by winSize, clamped at zero in both dimensions.
        Size getWorkingSize(Size winSize) const
        {
            const int workWidth  = std::max(szi.width - winSize.width, 0);
            const int workHeight = std::max(szi.height - winSize.height, 0);
            return Size(workWidth, workHeight);
        }

        float scale;
        Size szi;
        int layer_ofs, ystep;
    };

    virtual ~FeatureEvaluator();

    virtual bool read(const FileNode& node, Size origWinSize);
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const;
    int getNumChannels() const { return nchannels; }

    virtual bool setImage(InputArray img, const std::vector<float>& scales);
    virtual bool setWindow(Point p, int scaleIdx);

    // Bounds-checked access to the record of scale scaleIdx.
    const ScaleData& getScaleData(int scaleIdx) const
    {
        CV_Assert( 0 <= scaleIdx && scaleIdx < (int)scaleData->size());
        const std::vector<ScaleData>& allScales = *scaleData;
        return allScales.at(scaleIdx);
    }

    virtual void getUMats(std::vector<UMat>& bufs);
    virtual void getMats();

    Size getLocalSize() const { return localSize; }
    Size getLocalBufSize() const { return lbufSize; }

    virtual float calcOrd(int featureIdx) const;
    virtual int calcCat(int featureIdx) const;

    static Ptr<FeatureEvaluator> create(int type);

protected:
    // Bit flags stored in sbufFlag.
    enum { SBUF_VALID=1, USBUF_VALID=2 };
    int sbufFlag;

    bool updateScaleData( Size imgsz, const std::vector<float>& _scales );

    // Hooks with empty default implementations; HaarEvaluator and LBPEvaluator override them.
    virtual void computeChannels( int, InputArray ) {}
    virtual void computeOptFeatures() {}

    Size origWinSize, sbufSize, localSize, lbufSize;
    int nchannels;
    Mat sbuf, rbuf;
    UMat urbuf, usbuf, ufbuf, uscaleData;

    Ptr<std::vector<ScaleData> > scaleData;
};
class CascadeClassifierImpl : public BaseCascadeClassifier class CascadeClassifierImpl : public BaseCascadeClassifier
{ {
public: public:
@ -54,9 +120,8 @@ protected:
int yStep, double factor, std::vector<Rect>& candidates, int yStep, double factor, std::vector<Rect>& candidates,
std::vector<int>& rejectLevels, std::vector<double>& levelWeights, std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
Size sumSize0, bool outputRejectLevels = false ); Size sumSize0, bool outputRejectLevels = false );
bool ocl_detectSingleScale( InputArray image, Size processingRectSize, bool ocl_detectMultiScaleNoGrouping( const std::vector<float>& scales,
int yStep, double factor, Size sumSize0 ); std::vector<Rect>& candidates );
void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates, void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates,
std::vector<int>& rejectLevels, std::vector<double>& levelWeights, std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
@ -72,6 +137,7 @@ protected:
}; };
friend class CascadeClassifierInvoker; friend class CascadeClassifierInvoker;
friend class SparseCascadeClassifierInvoker;
template<class FEval> template<class FEval>
friend int predictOrdered( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight); friend int predictOrdered( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
@ -85,7 +151,7 @@ protected:
template<class FEval> template<class FEval>
friend int predictCategoricalStump( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight); friend int predictCategoricalStump( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
int runAt( Ptr<FeatureEvaluator>& feval, Point pt, double& weight ); int runAt( Ptr<FeatureEvaluator>& feval, Point pt, int scaleIdx, double& weight );
class Data class Data
{ {
@ -126,12 +192,10 @@ protected:
bool read(const FileNode &node); bool read(const FileNode &node);
bool isStumpBased() const { return maxNodesPerTree == 1; }
int stageType; int stageType;
int featureType; int featureType;
int ncategories; int ncategories;
int maxNodesPerTree; int minNodesPerTree, maxNodesPerTree;
Size origWinSize; Size origWinSize;
std::vector<Stage> stages; std::vector<Stage> stages;
@ -148,7 +212,7 @@ protected:
Ptr<MaskGenerator> maskGenerator; Ptr<MaskGenerator> maskGenerator;
UMat ugrayImage, uimageBuffer; UMat ugrayImage, uimageBuffer;
UMat ufacepos, ustages, ustumps, usubsets; UMat ufacepos, ustages, unodes, uleaves, usubsets;
ocl::Kernel haarKernel, lbpKernel; ocl::Kernel haarKernel, lbpKernel;
bool tryOpenCL; bool tryOpenCL;
@ -268,7 +332,6 @@ public:
enum { RECT_NUM = Feature::RECT_NUM }; enum { RECT_NUM = Feature::RECT_NUM };
float calc( const int* pwin ) const; float calc( const int* pwin ) const;
void setOffsets( const Feature& _f, int step, int tofs ); void setOffsets( const Feature& _f, int step, int tofs );
int ofs[RECT_NUM][4]; int ofs[RECT_NUM][4];
@ -278,35 +341,34 @@ public:
HaarEvaluator(); HaarEvaluator();
virtual ~HaarEvaluator(); virtual ~HaarEvaluator();
virtual bool read( const FileNode& node ); virtual bool read( const FileNode& node, Size origWinSize);
virtual Ptr<FeatureEvaluator> clone() const; virtual Ptr<FeatureEvaluator> clone() const;
virtual int getFeatureType() const { return FeatureEvaluator::HAAR; } virtual int getFeatureType() const { return FeatureEvaluator::HAAR; }
virtual bool setImage(InputArray, Size origWinSize, Size sumSize); virtual bool setWindow(Point p, int scaleIdx);
virtual bool setWindow(Point pt); Rect getNormRect() const;
virtual Rect getNormRect() const; int getSquaresOffset() const;
virtual void getUMats(std::vector<UMat>& bufs);
double operator()(int featureIdx) const float operator()(int featureIdx) const
{ return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; } { return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
virtual double calcOrd(int featureIdx) const virtual float calcOrd(int featureIdx) const
{ return (*this)(featureIdx); } { return (*this)(featureIdx); }
protected: protected:
Size origWinSize, sumSize0; virtual void computeChannels( int i, InputArray img );
virtual void computeOptFeatures();
Ptr<std::vector<Feature> > features; Ptr<std::vector<Feature> > features;
Ptr<std::vector<OptFeature> > optfeatures; Ptr<std::vector<OptFeature> > optfeatures;
OptFeature* optfeaturesPtr; // optimization Ptr<std::vector<OptFeature> > optfeatures_lbuf;
bool hasTiltedFeatures; bool hasTiltedFeatures;
Mat sum0, sum, sqsum0, sqsum; int tofs, sqofs;
UMat usum0, usum, usqsum0, usqsum, ufbuf; Vec4i nofs;
Rect normrect; Rect normrect;
int nofs[4];
const int* pwin; const int* pwin;
double varianceNormFactor; OptFeature* optfeaturesPtr; // optimization
float varianceNormFactor;
}; };
inline HaarEvaluator::Feature :: Feature() inline HaarEvaluator::Feature :: Feature()
@ -336,28 +398,6 @@ inline float HaarEvaluator::OptFeature :: calc( const int* ptr ) const
return ret; return ret;
} }
// Fills weight[] and ofs[][] of this optimized feature from the source feature _f.
//   _f   - feature whose rectangles and weights are converted
//   step - row stride passed to the offset macros
//   tofs - base offset passed to the tilted-offset macro (used only when _f.tilted is set)
inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step, int tofs )
{
    weight[0] = _f.rect[0].weight;
    weight[1] = _f.rect[1].weight;
    weight[2] = _f.rect[2].weight;

    // A third rectangle without a positive weight is replaced by an empty one.
    Rect r2 = weight[2] > 0 ? _f.rect[2].r : Rect(0,0,0,0);
    if (_f.tilted)
    {
        CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], tofs, _f.rect[0].r, step );
        CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], tofs, _f.rect[1].r, step );
        // ofs[][] holds integer offsets, so use the *_OFS macro here as well,
        // consistently with the two rectangles above (was CV_TILTED_PTRS).
        CV_TILTED_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], tofs, r2, step );
    }
    else
    {
        CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
        CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
        CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, r2, step );
    }
}
//---------------------------------------------- LBPEvaluator ------------------------------------- //---------------------------------------------- LBPEvaluator -------------------------------------
class LBPEvaluator : public FeatureEvaluator class LBPEvaluator : public FeatureEvaluator
@ -367,7 +407,7 @@ public:
{ {
Feature(); Feature();
Feature( int x, int y, int _block_w, int _block_h ) : Feature( int x, int y, int _block_w, int _block_h ) :
rect(x, y, _block_w, _block_h) {} rect(x, y, _block_w, _block_h) {}
bool read(const FileNode& node ); bool read(const FileNode& node );
@ -386,27 +426,25 @@ public:
LBPEvaluator(); LBPEvaluator();
virtual ~LBPEvaluator(); virtual ~LBPEvaluator();
virtual bool read( const FileNode& node ); virtual bool read( const FileNode& node, Size origWinSize );
virtual Ptr<FeatureEvaluator> clone() const; virtual Ptr<FeatureEvaluator> clone() const;
virtual int getFeatureType() const { return FeatureEvaluator::LBP; } virtual int getFeatureType() const { return FeatureEvaluator::LBP; }
virtual bool setImage(InputArray image, Size _origWinSize, Size); virtual bool setWindow(Point p, int scaleIdx);
virtual bool setWindow(Point pt);
virtual void getUMats(std::vector<UMat>& bufs);
int operator()(int featureIdx) const int operator()(int featureIdx) const
{ return optfeaturesPtr[featureIdx].calc(pwin); } { return optfeaturesPtr[featureIdx].calc(pwin); }
virtual int calcCat(int featureIdx) const virtual int calcCat(int featureIdx) const
{ return (*this)(featureIdx); } { return (*this)(featureIdx); }
protected: protected:
Size origWinSize, sumSize0; virtual void computeChannels( int i, InputArray img );
virtual void computeOptFeatures();
Ptr<std::vector<Feature> > features; Ptr<std::vector<Feature> > features;
Ptr<std::vector<OptFeature> > optfeatures; Ptr<std::vector<OptFeature> > optfeatures;
Ptr<std::vector<OptFeature> > optfeatures_lbuf;
OptFeature* optfeaturesPtr; // optimization OptFeature* optfeaturesPtr; // optimization
Mat sum0, sum;
UMat usum0, usum, ufbuf;
const int* pwin; const int* pwin;
}; };
@ -436,98 +474,6 @@ inline int LBPEvaluator::OptFeature :: calc( const int* p ) const
(CALC_SUM_OFS_( ofs[4], ofs[5], ofs[8], ofs[9], p ) >= cval ? 1 : 0); (CALC_SUM_OFS_( ofs[4], ofs[5], ofs[8], ofs[9], p ) >= cval ? 1 : 0);
} }
inline void LBPEvaluator::OptFeature :: setOffsets( const Feature& _f, int step )
{
Rect tr = _f.rect;
CV_SUM_OFS( ofs[0], ofs[1], ofs[4], ofs[5], 0, tr, step );
tr.x += 2*_f.rect.width;
CV_SUM_OFS( ofs[2], ofs[3], ofs[6], ofs[7], 0, tr, step );
tr.y += 2*_f.rect.height;
CV_SUM_OFS( ofs[10], ofs[11], ofs[14], ofs[15], 0, tr, step );
tr.x -= 2*_f.rect.width;
CV_SUM_OFS( ofs[8], ofs[9], ofs[12], ofs[13], 0, tr, step );
}
//---------------------------------------------- HOGEvaluator -------------------------------------------
// Feature evaluator for HOG features (getFeatureType() returns FeatureEvaluator::HOG).
class HOGEvaluator : public FeatureEvaluator
{
public:
    // A single HOG feature.
    struct Feature
    {
        Feature();

        float calc( int offset ) const;
        void updatePtrs( const std::vector<Mat>& _hist, const Mat &_normSum );
        bool read( const FileNode& node );

        enum { CELL_NUM = 4, BIN_NUM = 9 };

        Rect rect[CELL_NUM];
        int featComponent; //component index from 0 to 35
        const float* pF[4]; //for feature calculation
        const float* pN[4]; //for normalization calculation
    };

    HOGEvaluator();
    virtual ~HOGEvaluator();

    virtual bool read( const FileNode& node );
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const { return FeatureEvaluator::HOG; }

    virtual bool setImage( InputArray image, Size winSize, Size );
    virtual bool setWindow( Point pt );

    // Value of feature featureIdx at the current offset.
    double operator()(int featureIdx) const
    {
        const Feature& feature = featuresPtr[featureIdx];
        return feature.calc(offset);
    }

    // Ordered-feature entry point; forwards to operator().
    virtual double calcOrd( int featureIdx ) const
    {
        return operator()(featureIdx);
    }

private:
    virtual void integralHistogram( const Mat& srcImage, std::vector<Mat> &histogram, Mat &norm, int nbins ) const;

    Size origWinSize;
    Ptr<std::vector<Feature> > features;
    Feature* featuresPtr;
    std::vector<Mat> hist;
    Mat normSum;
    int offset;
};
// Default constructor: empty rectangles, null pointers, component index 0.
inline HOGEvaluator::Feature :: Feature()
{
    for( int k = 0; k < 4; k++ )
    {
        rect[k] = Rect();
        pF[k] = 0;
        pN[k] = 0;
    }
    featComponent = 0;
}
// Returns CALC_SUM over pF divided by (CALC_SUM over pN + 0.001f), both taken
// at _offset, or 0 when the pF sum does not exceed 0.001f.
inline float HOGEvaluator::Feature :: calc( int _offset ) const
{
    const float featSum = CALC_SUM(pF, _offset);
    const float normTotal = CALC_SUM(pN, _offset);
    if( featSum > 0.001f )
        return featSum / ( normTotal + 0.001f);
    return 0.f;
}
inline void HOGEvaluator::Feature :: updatePtrs( const std::vector<Mat> &_hist, const Mat &_normSum )
{
int binIdx = featComponent % BIN_NUM;
int cellIdx = featComponent / BIN_NUM;
Rect normRect = Rect( rect[0].x, rect[0].y, 2*rect[0].width, 2*rect[0].height );
const float* featBuf = (const float*)_hist[binIdx].data;
size_t featStep = _hist[0].step / sizeof(featBuf[0]);
const float* normBuf = (const float*)_normSum.data;
size_t normStep = _normSum.step / sizeof(normBuf[0]);
CV_SUM_PTRS( pF[0], pF[1], pF[2], pF[3], featBuf, rect[cellIdx], featStep );
CV_SUM_PTRS( pN[0], pN[1], pN[2], pN[3], normBuf, normRect, normStep );
}
//---------------------------------------------- predictor functions ------------------------------------- //---------------------------------------------- predictor functions -------------------------------------
@ -662,11 +608,7 @@ inline int predictCategoricalStump( CascadeClassifierImpl& cascade,
const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0]; const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0];
const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0]; const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];
#ifdef HAVE_TEGRA_OPTIMIZATION float tmp = 0;
float tmp = 0; // float accumulator -- float operations are quicker
#else
double tmp = 0;
#endif
for( int si = 0; si < nstages; si++ ) for( int si = 0; si < nstages; si++ )
{ {
const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si]; const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si];

@ -1,6 +1,18 @@
///////////////////////////// OpenCL kernels for face detection ////////////////////////////// ///////////////////////////// OpenCL kernels for face detection //////////////////////////////
////////////////////////////// see the opencv/doc/license.txt /////////////////////////////// ////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
//
// the code has been derived from the OpenCL Haar cascade kernel by
//
// Niko Li, newlife20080214@gmail.com
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
// Erping Pang, erping@multicorewareinc.com
//
typedef struct __attribute__((aligned(4))) OptHaarFeature typedef struct __attribute__((aligned(4))) OptHaarFeature
{ {
int4 ofs[3] __attribute__((aligned (4))); int4 ofs[3] __attribute__((aligned (4)));
@ -20,6 +32,12 @@ typedef struct __attribute__((aligned(4))) Stump
} }
Stump; Stump;
// One node of a multi-node tree, as read by the NODE_COUNT != 1 branches of runHaarClassifier:
//   n.x - index into optfeatures
//   n.y - bit pattern of a float threshold (read with as_float and scaled by nf)
//   n.z - next index when the feature value is below the scaled threshold
//   n.w - next index otherwise
// A next index <= 0 ends the walk; its negation is used to index the leaves array.
typedef struct __attribute__((aligned(4))) Node
{
int4 n __attribute__((aligned (4)));
}
Node;
typedef struct __attribute__((aligned (4))) Stage typedef struct __attribute__((aligned (4))) Stage
{ {
int first __attribute__((aligned (4))); int first __attribute__((aligned (4)));
@ -28,151 +46,614 @@ typedef struct __attribute__((aligned (4))) Stage
} }
Stage; Stage;
__kernel void runHaarClassifierStump( typedef struct __attribute__((aligned (4))) ScaleData
{
float scale __attribute__((aligned (4)));
int szi_width __attribute__((aligned (4)));
int szi_height __attribute__((aligned (4)));
int layer_ofs __attribute__((aligned (4)));
int ystep __attribute__((aligned (4)));
}
ScaleData;
#ifndef SUM_BUF_SIZE
#define SUM_BUF_SIZE 0
#endif
#ifndef NODE_COUNT
#define NODE_COUNT 1
#endif
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
void runHaarClassifier(
int nscales, __global const ScaleData* scaleData,
__global const int* sum, __global const int* sum,
int sumstep, int sumoffset, int _sumstep, int sumoffset,
__global const int* sqsum,
int sqsumstep, int sqsumoffset,
__global const OptHaarFeature* optfeatures, __global const OptHaarFeature* optfeatures,
int nstages, int splitstage, int nstages,
__global const Stage* stages, __global const Stage* stages,
__global const Stump* stumps, __global const Node* nodes,
__global const float* leaves0,
volatile __global int* facepos, volatile __global int* facepos,
int2 imgsize, int xyscale, float factor, int4 normrect, int sqofs, int2 windowsize, int maxFaces)
int4 normrect, int2 windowsize, int maxFaces)
{ {
int ix = get_global_id(0)*xyscale; int lx = get_local_id(0);
int iy = get_global_id(1)*xyscale; int ly = get_local_id(1);
sumstep /= sizeof(int); int groupIdx = get_group_id(0);
sqsumstep /= sizeof(int); int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
int scaleIdx, tileIdx, stageIdx;
int sumstep = (int)(_sumstep/sizeof(int));
int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),
mad24(normrect.y, sumstep, normrect.x + normrect.z),
mad24(normrect.y + normrect.w, sumstep, normrect.x),
mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
int normarea = normrect.z * normrect.w;
float invarea = 1.f/normarea;
int lidx = ly*LOCAL_SIZE_X + lx;
if( ix < imgsize.x && iy < imgsize.y ) #if SUM_BUF_SIZE > 0
int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),
mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
#else
int4 nofs = nofs0;
#endif
#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
#if SUM_BUF_SIZE > 0
__local int* ibuf = lstore;
__local int* lcount = ibuf + SUM_BUF_SIZE;
#else
__local int* lcount = lstore;
#endif
__local float* lnf = (__local float*)(lcount + 1);
__local float* lpartsum = lnf + LOCAL_SIZE;
__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
{ {
int stageIdx; __global const ScaleData* s = scaleData + scaleIdx;
__global const Stump* stump = stumps; int ystep = s->ystep;
int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
__global const int* psum = sum + mad24(iy, sumstep, ix); int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
__global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x); (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
int normarea = normrect.z * normrect.w; int totalTiles = ntiles.x*ntiles.y;
float invarea = 1.f/normarea;
float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] + for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
nf = nf > 0 ? nf : 1.f;
for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
{ {
int i, ntrees = stages[stageIdx].ntrees; int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
float s = 0.f; int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
for( i = 0; i < ntrees; i++, stump++ ) int ix = lx, iy = ly;
__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
__global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
if( ix0 >= worksize.x || iy0 >= worksize.y )
continue;
#if SUM_BUF_SIZE > 0
for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
{ {
float4 st = stump->st; int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
__global const OptHaarFeature* f = optfeatures + as_int(st.x); vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
float4 weight = f->weight; }
barrier(CLK_LOCAL_MEM_FENCE);
int4 ofs = f->ofs[0]; #endif
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1]; if( lidx == 0 )
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y; lcount[0] = 0;
if( weight.z > 0 ) barrier(CLK_LOCAL_MEM_FENCE);
if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
{
#if NODE_COUNT==1
__global const Stump* stump = (__global const Stump*)nodes;
#else
__global const Node* node = nodes;
__global const float* leaves = leaves0;
#endif
#if SUM_BUF_SIZE > 0
__local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
#else
__global const int* psum = psum1;
#endif
__global const float* psqsum = (__global const float*)(psum1 + sqofs);
float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
nf = nf > 0 ? nf : 1.f;
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
{ {
ofs = f->ofs[2]; int ntrees = stages[stageIdx].ntrees;
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z; float s = 0.f;
#if NODE_COUNT==1
for( i = 0; i < ntrees; i++ )
{
float4 st = stump[i].st;
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
float4 weight = f->weight;
int4 ofs = f->ofs[0];
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
if( weight.z > 0 )
{
ofs = f->ofs[2];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
}
s += (sval < st.y*nf) ? st.z : st.w;
}
stump += ntrees;
#else
for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
{
int idx = 0;
do
{
int4 n = node[idx].n;
__global const OptHaarFeature* f = optfeatures + n.x;
float4 weight = f->weight;
int4 ofs = f->ofs[0];
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
if( weight.z > 0 )
{
ofs = f->ofs[2];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
}
idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
}
while(idx > 0);
s += leaves[-idx];
}
#endif
if( s < stages[stageIdx].threshold )
break;
} }
s += (sval < st.y*nf) ? st.z : st.w; if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
{
int count = atomic_inc(lcount);
lbuf[count] = (int)(ix | (iy << 8));
lnf[count] = nf;
}
} }
if( s < stages[stageIdx].threshold ) for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
break; {
} int nrects = lcount[0];
if( stageIdx == nstages ) barrier(CLK_LOCAL_MEM_FENCE);
{ if( nrects == 0 )
int nfaces = atomic_inc(facepos); break;
if( nfaces < maxFaces ) if( lidx == 0 )
lcount[0] = 0;
{
#if NODE_COUNT == 1
__global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
#else
__global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
__global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
#endif
int nparts = LOCAL_SIZE / nrects;
int ntrees = stages[stageIdx].ntrees;
int ntrees_p = (ntrees + nparts - 1)/nparts;
int nr = lidx / nparts;
int partidx = -1, idxval = 0;
float partsum = 0.f, nf = 0.f;
if( nr < nrects )
{
partidx = lidx % nparts;
idxval = lbuf[nr];
nf = lnf[nr];
{
int ntrees0 = ntrees_p*partidx;
int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
int ix1 = idxval & 255, iy1 = idxval >> 8;
#if SUM_BUF_SIZE > 0
__local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
#else
__global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
#endif
#if NODE_COUNT == 1
for( i = ntrees0; i < ntrees1; i++ )
{
float4 st = stump[i].st;
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
float4 weight = f->weight;
int4 ofs = f->ofs[0];
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
//if( weight.z > 0 )
{
ofs = f->ofs[2];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
}
partsum += (sval < st.y*nf) ? st.z : st.w;
}
#else
for( i = ntrees0; i < ntrees1; i++ )
{
int idx = 0;
do
{
int4 n = node[i*2 + idx].n;
__global const OptHaarFeature* f = optfeatures + n.x;
float4 weight = f->weight;
int4 ofs = f->ofs[0];
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
if( weight.z > 0 )
{
ofs = f->ofs[2];
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
}
idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
}
while(idx > 0);
partsum += leaves[i*3-idx];
}
#endif
}
}
lpartsum[lidx] = partsum;
barrier(CLK_LOCAL_MEM_FENCE);
if( partidx == 0 )
{
float s = lpartsum[nr*nparts];
for( i = 1; i < nparts; i++ )
s += lpartsum[i + nr*nparts];
if( s >= stages[stageIdx].threshold )
{
int count = atomic_inc(lcount);
lbuf[count] = idxval;
lnf[count] = nf;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if( stageIdx == nstages )
{ {
volatile __global int* face = facepos + 1 + nfaces*4; int nrects = lcount[0];
face[0] = convert_int_rte(ix*factor); if( lidx < nrects )
face[1] = convert_int_rte(iy*factor); {
face[2] = convert_int_rte(windowsize.x*factor); int nfaces = atomic_inc(facepos);
face[3] = convert_int_rte(windowsize.y*factor); if( nfaces < maxFaces )
{
volatile __global int* face = facepos + 1 + nfaces*3;
int val = lbuf[lidx];
face[0] = scaleIdx;
face[1] = ix0 + (val & 255);
face[2] = iy0 + (val >> 8);
}
}
} }
} }
} }
} }
#undef CALC_SUM_OFS_
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
__kernel void runLBPClassifierStump( __kernel void runLBPClassifierStumpSimple(
int nscales, __global const ScaleData* scaleData,
__global const int* sum, __global const int* sum,
int sumstep, int sumoffset, int _sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures, __global const OptLBPFeature* optfeatures,
int nstages, int splitstage, int nstages,
__global const Stage* stages, __global const Stage* stages,
__global const Stump* stumps, __global const Stump* stumps,
__global const int* bitsets, __global const int* bitsets,
int bitsetSize, int bitsetSize,
volatile __global int* facepos, volatile __global int* facepos,
int2 imgsize, int xyscale, float factor,
int2 windowsize, int maxFaces) int2 windowsize, int maxFaces)
{ {
int ix = get_global_id(0)*xyscale; int lx = get_local_id(0);
int iy = get_global_id(1)*xyscale; int ly = get_local_id(1);
sumstep /= sizeof(int); int local_size_x = get_local_size(0);
int local_size_y = get_local_size(1);
int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
int ngroups = get_num_groups(0)*get_num_groups(1);
int scaleIdx, tileIdx, stageIdx;
int startStage = 0, endStage = nstages;
int sumstep = (int)(_sumstep/sizeof(int));
if( ix < imgsize.x && iy < imgsize.y ) for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
{ {
int stageIdx; __global const ScaleData* s = scaleData + scaleIdx;
__global const Stump* stump = stumps; int ystep = s->ystep;
__global const int* p = sum + mad24(iy, sumstep, ix); int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
(worksize.y/ystep + local_size_y-1)/local_size_y);
int totalTiles = ntiles.x*ntiles.y;
for( stageIdx = 0; stageIdx < nstages; stageIdx++ ) for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
{ {
int i, ntrees = stages[stageIdx].ntrees; int iy = ((tileIdx / ntiles.x)*local_size_y + ly)*ystep;
float s = 0.f; int ix = ((tileIdx % ntiles.x)*local_size_x + lx)*ystep;
for( i = 0; i < ntrees; i++, stump++, bitsets += bitsetSize )
if( ix < worksize.x && iy < worksize.y )
{ {
float4 st = stump->st; __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
__global const OptLBPFeature* f = optfeatures + as_int(st.x); __global const Stump* stump = stumps;
int16 ofs = f->ofs; __global const int* bitset = bitsets;
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \ for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3]) {
int i, ntrees = stages[stageIdx].ntrees;
float s = 0.f;
for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
{
float4 st = stump->st;
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
int16 ofs = f->ofs;
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
s += (bitsets[idx] & (1 << mask)) ? st.z : st.w; s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
} }
if( s < stages[stageIdx].threshold )
break;
}
if( s < stages[stageIdx].threshold ) if( stageIdx == nstages )
break; {
int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces )
{
volatile __global int* face = facepos + 1 + nfaces*3;
face[0] = scaleIdx;
face[1] = ix;
face[2] = iy;
}
}
}
} }
}
}
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
void runLBPClassifierStump(
int nscales, __global const ScaleData* scaleData,
__global const int* sum,
int _sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages,
__global const Stump* stumps,
__global const int* bitsets,
int bitsetSize,
volatile __global int* facepos,
int2 windowsize, int maxFaces)
{
int lx = get_local_id(0);
int ly = get_local_id(1);
int groupIdx = get_group_id(0);
int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
int scaleIdx, tileIdx, stageIdx;
int sumstep = (int)(_sumstep/sizeof(int));
int lidx = ly*LOCAL_SIZE_X + lx;
if( stageIdx == nstages ) #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
#if SUM_BUF_SIZE > 0
__local int* ibuf = lstore;
__local int* lcount = ibuf + SUM_BUF_SIZE;
#else
__local int* lcount = lstore;
#endif
__local float* lpartsum = (__local float*)(lcount + 1);
__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
{
__global const ScaleData* s = scaleData + scaleIdx;
int ystep = s->ystep;
int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
int totalTiles = ntiles.x*ntiles.y;
for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
{ {
int nfaces = atomic_inc(facepos); int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
if( nfaces < maxFaces ) int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
int ix = lx, iy = ly;
__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
if( ix0 >= worksize.x || iy0 >= worksize.y )
continue;
#if SUM_BUF_SIZE > 0
for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
{ {
volatile __global int* face = facepos + 1 + nfaces*4; int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
face[0] = convert_int_rte(ix*factor); vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
face[1] = convert_int_rte(iy*factor); }
face[2] = convert_int_rte(windowsize.x*factor); barrier(CLK_LOCAL_MEM_FENCE);
face[3] = convert_int_rte(windowsize.y*factor); #endif
if( lidx == 0 )
lcount[0] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
{
__global const Stump* stump = stumps;
__global const int* bitset = bitsets;
#if SUM_BUF_SIZE > 0
__local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
#else
__global const int* p = psum0 + mad24(iy, sumstep, ix);
#endif
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
{
int ntrees = stages[stageIdx].ntrees;
float s = 0.f;
for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
{
float4 st = stump->st;
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
int16 ofs = f->ofs;
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
}
if( s < stages[stageIdx].threshold )
break;
}
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
{
int count = atomic_inc(lcount);
lbuf[count] = (int)(ix | (iy << 8));
}
}
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
{
int nrects = lcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
if( nrects == 0 )
break;
if( lidx == 0 )
lcount[0] = 0;
{
__global const Stump* stump = stumps + stages[stageIdx].first;
__global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
int nparts = LOCAL_SIZE / nrects;
int ntrees = stages[stageIdx].ntrees;
int ntrees_p = (ntrees + nparts - 1)/nparts;
int nr = lidx / nparts;
int partidx = -1, idxval = 0;
float partsum = 0.f, nf = 0.f;
if( nr < nrects )
{
partidx = lidx % nparts;
idxval = lbuf[nr];
{
int ntrees0 = ntrees_p*partidx;
int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
int ix1 = idxval & 255, iy1 = idxval >> 8;
#if SUM_BUF_SIZE > 0
__local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
#else
__global const int* p = psum0 + mad24(iy1, sumstep, ix1);
#endif
for( i = ntrees0; i < ntrees1; i++ )
{
float4 st = stump[i].st;
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
int16 ofs = f->ofs;
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
}
}
}
lpartsum[lidx] = partsum;
barrier(CLK_LOCAL_MEM_FENCE);
if( partidx == 0 )
{
float s = lpartsum[nr*nparts];
for( i = 1; i < nparts; i++ )
s += lpartsum[i + nr*nparts];
if( s >= stages[stageIdx].threshold )
{
int count = atomic_inc(lcount);
lbuf[count] = idxval;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if( stageIdx == nstages )
{
int nrects = lcount[0];
if( lidx < nrects )
{
int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces )
{
volatile __global int* face = facepos + 1 + nfaces*3;
int val = lbuf[lidx];
face[0] = scaleIdx;
face[1] = ix0 + (val & 255);
face[2] = iy0 + (val >> 8);
}
}
} }
} }
} }

@ -257,6 +257,7 @@ int CV_DetectorTest::runTestCase( int detectorIdx, vector<vector<Rect> >& object
string dataPath = ts->get_data_path(), detectorFilename; string dataPath = ts->get_data_path(), detectorFilename;
if( !detectorFilenames[detectorIdx].empty() ) if( !detectorFilenames[detectorIdx].empty() )
detectorFilename = dataPath + detectorFilenames[detectorIdx]; detectorFilename = dataPath + detectorFilenames[detectorIdx];
printf("detector %s\n", detectorFilename.c_str());
for( int ii = 0; ii < (int)imageFilenames.size(); ++ii ) for( int ii = 0; ii < (int)imageFilenames.size(); ++ii )
{ {

@ -231,9 +231,14 @@ void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
smallImg.copyTo(canvas); smallImg.copyTo(canvas);
double fps = getTickFrequency()/t; double fps = getTickFrequency()/t;
static double avgfps = 0;
static int nframes = 0;
nframes++;
double alpha = nframes > 50 ? 0.01 : 1./nframes;
avgfps = avgfps*(1-alpha) + fps*alpha;
putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50), putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", avgfps), Point(50, 30),
FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3); FONT_HERSHEY_SIMPLEX, 0.8, Scalar(0,255,0), 2);
for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ ) for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
{ {

Loading…
Cancel
Save