@ -47,18 +47,18 @@ |
/* Fallback for SSE2/SSE3 builds that lack SSE4.1/SSE4.2: emulate the blendv
 * intrinsics with bitwise operations. a ^ ((b ^ a) & m) yields the bits of b
 * wherever the mask bit is set and the bits of a elsewhere.
 * NOTE(review): this selects per bit, whereas the native SSE4.1 blendv selects
 * per element using only the mask's sign bit. The two agree only when every
 * mask lane is all-ones or all-zeros (e.g. a compare result) - confirm that all
 * callers pass such masks.
 * NOTE(review): the macro pair is listed twice with identical replacement text
 * (a benign redefinition); presumably one copy can be dropped - confirm. */
#if CV_SSE2 || CV_SSE3 |
# if !CV_SSE4_1 && !CV_SSE4_2 |
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) |
# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) |
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) |
# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) |
# endif |
#endif |
/* Compile-time choice of the SIMD code path: CV_HAAR_USE_AVX is defined when
 * CV_AVX is set; otherwise CV_HAAR_USE_SSE is defined when CV_SSE2 or CV_SSE3
 * is set. If none of them is set, neither macro is defined.
 * NOTE(review): the same conditional appears twice below with identical
 * definitions; presumably one copy is redundant - confirm. */
# if CV_AVX |
# define CV_HAAR_USE_AVX 1 |
# else |
# if CV_SSE2 || CV_SSE3 |
# define CV_HAAR_USE_SSE 1 |
# endif |
# endif |
# if CV_AVX |
# define CV_HAAR_USE_AVX 1 |
# else |
# if CV_SSE2 || CV_SSE3 |
# define CV_HAAR_USE_SSE 1 |
# endif |
# endif |
/* these settings affect the quality of detection: change with care */ |
@ -634,86 +634,86 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, |
// AVX version of icvEvalHidHaarClassifier. Evaluates the 8 consecutive
// CvHidHaarClassifiers classifier[0] .. classifier[7] in one call and returns
// the sum of the alpha values of the leaves they reach.
//   classifier           - first of the 8 classifiers read by this call
//   variance_norm_factor - multiplied into every node threshold
//   p_offset             - forwarded unchanged to calc_sum()
// The caller must check for AVX support before invoking this function.
#ifdef CV_HAAR_USE_AVX |
#ifdef CV_HAAR_USE_AVX |
double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier, |
double variance_norm_factor, size_t p_offset ) |
double variance_norm_factor, size_t p_offset ) |
{ |
// idxV[i]: index of the node currently visited in tree i (0 = root).
int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0}; |
// flags[i]: set once tree i has contributed its leaf alpha to res.
char flags[8] = {0,0,0,0,0,0,0,0}; |
CvHidHaarTreeNode* nodes[8]; |
double res = 0;
// Counts how many of the 8 trees have reached a leaf.
char exitConditionFlag = 0; |
for(;;) |
{ |
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; |
nodes[0] = classifier ->node + idxV[0]; |
nodes[1] = (classifier+1)->node + idxV[1]; |
nodes[2] = (classifier+2)->node + idxV[2]; |
nodes[3] = (classifier+3)->node + idxV[3]; |
nodes[4] = (classifier+4)->node + idxV[4]; |
nodes[5] = (classifier+5)->node + idxV[5]; |
nodes[6] = (classifier+6)->node + idxV[6]; |
nodes[7] = (classifier+7)->node + idxV[7]; |
__m256 t = _mm256_set1_ps(variance_norm_factor); |
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); |
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), |
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], |
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
__m256 sum = _mm256_mul_ps(offset, weight); |
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), |
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), |
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); |
if( nodes[0]->feature.rect[2].p0 ) |
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; |
if( nodes[1]->feature.rect[2].p0 ) |
// NOTE(review): the declarations and loop that follow repeat the ones above
// almost verbatim; this looks like two revisions of the same body interleaved -
// confirm which copy is intended before relying on this text.
int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0}; |
char flags[8] = {0,0,0,0,0,0,0,0}; |
CvHidHaarTreeNode* nodes[8]; |
double res = 0; |
char exitConditionFlag = 0; |
for(;;) |
{ |
// Third-rectangle contributions, one per tree; stays 0 where rect[2] is unused.
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; |
nodes[0] = classifier ->node + idxV[0]; |
nodes[1] = (classifier+1)->node + idxV[1]; |
nodes[2] = (classifier+2)->node + idxV[2]; |
nodes[3] = (classifier+3)->node + idxV[3]; |
nodes[4] = (classifier+4)->node + idxV[4]; |
nodes[5] = (classifier+5)->node + idxV[5]; |
nodes[6] = (classifier+6)->node + idxV[6]; |
nodes[7] = (classifier+7)->node + idxV[7]; |
// t = node threshold * variance_norm_factor, one lane per tree (lane i <-> nodes[i]).
__m256 t = _mm256_set1_ps(variance_norm_factor); |
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); |
// sum = calc_sum(rect[0]) * weight[0] + calc_sum(rect[1]) * weight[1], per lane.
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), |
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], |
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); |
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, |
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); |
__m256 sum = _mm256_mul_ps(offset, weight); |
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), |
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), |
calc_sum(nodes[0]->feature.rect[1],p_offset)); |
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, |
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); |
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); |
// The third rectangle is optional: it is added only when rect[2].p0 is non-null.
if( nodes[0]->feature.rect[2].p0 ) |
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; |
if( nodes[1]->feature.rect[2].p0 ) |
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; |
if( nodes[2]->feature.rect[2].p0 ) |
if( nodes[2]->feature.rect[2].p0 ) |
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; |
if( nodes[3]->feature.rect[2].p0 ) |
if( nodes[3]->feature.rect[2].p0 ) |
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; |
if( nodes[4]->feature.rect[2].p0 ) |
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; |
if( nodes[5]->feature.rect[2].p0 ) |
if( nodes[4]->feature.rect[2].p0 ) |
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; |
if( nodes[5]->feature.rect[2].p0 ) |
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; |
if( nodes[6]->feature.rect[2].p0 ) |
if( nodes[6]->feature.rect[2].p0 ) |
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; |
if( nodes[7]->feature.rect[2].p0 ) |
if( nodes[7]->feature.rect[2].p0 ) |
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; |
sum = _mm256_add_ps(sum,_mm256_load_ps(tmp)); |
// The left/right child indices are carried in float lanes and truncated back to
// int after the blend.
__m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left); |
__m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right); |
// Next node per lane: left where sum < t, right otherwise.
_mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ )))); |
for(int i = 0; i < 8; i++) |
{ |
// A non-positive index marks a leaf; -idxV[i] is used as the index into alpha.
if(idxV[i]<=0) |
{ |
if(!flags[i]) |
{ |
exitConditionFlag++; |
flags[i]=1; |
res+=((classifier+i)->alpha[-idxV[i]]); |
} |
// A finished tree is reset to node 0; flags[i] keeps it from being added again.
idxV[i]=0; |
} |
} |
// Done once all 8 trees have reached a leaf.
if(exitConditionFlag==8) |
return res; |
} |
sum = _mm256_add_ps(sum,_mm256_load_ps(tmp)); |
__m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left); |
__m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right); |
_mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ )))); |
for(int i = 0; i < 8; i++) |
{ |
if(idxV[i]<=0) |
{ |
if(!flags[i]) |
{ |
exitConditionFlag++; |
flags[i]=1; |
res+=((classifier+i)->alpha[-idxV[i]]); |
} |
idxV[i]=0; |
} |
} |
if(exitConditionFlag==8) |
return res; |
} |
} |
#endif |
@ -723,50 +723,50 @@ double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier, |
size_t p_offset ) |
{ |
int idx = 0; |
if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow
{ |
double CV_DECL_ALIGNED(16) temp[2];
__m128d zero = _mm_setzero_pd(); |
do |
{ |
CvHidHaarTreeNode* node = classifier->node + idx; |
__m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor); |
__m128d left = _mm_set1_pd(node->left); |
__m128d right = _mm_set1_pd(node->right); |
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
__m128d sum = _mm_set1_pd(_sum); |
t = _mm_cmplt_sd(sum, t); |
sum = _mm_blendv_pd(right, left, t); |
_mm_store_pd(temp, sum); |
idx = (int)temp[0];
} |
while(idx > 0 ); |
} |
else |
#endif*/ |
if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow
{ |
double CV_DECL_ALIGNED(16) temp[2]; |
__m128d zero = _mm_setzero_pd(); |
do |
{ |
CvHidHaarTreeNode* node = classifier->node + idx; |
__m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor); |
__m128d left = _mm_set1_pd(node->left); |
__m128d right = _mm_set1_pd(node->right); |
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
__m128d sum = _mm_set1_pd(_sum); |
t = _mm_cmplt_sd(sum, t); |
sum = _mm_blendv_pd(right, left, t); |
_mm_store_pd(temp, sum); |
idx = (int)temp[0]; |
} |
while(idx > 0 ); |
} |
else |
#endif*/ |
{ |
do |
{ |
do |
{ |
CvHidHaarTreeNode* node = classifier->node + idx; |
double t = node->threshold * variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
if( node->feature.rect[2].p0 ) |
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
idx = sum < t ? node->left : node->right; |
} |
while( idx > 0 ); |
idx = sum < t ? node->left : node->right; |
} |
while( idx > 0 ); |
} |
return classifier->alpha[-idx]; |
} |
@ -777,18 +777,18 @@ static int |
cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, |
CvPoint pt, double& stage_sum, int start_stage ) |
{ |
#ifdef CV_HAAR_USE_AVX |
bool haveAVX = false; |
if(cv::checkHardwareSupport(CV_CPU_AVX)) |
if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers
{ |
haveAVX = true; |
#else |
#ifdef CV_HAAR_USE_SSE |
bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); |
#endif |
#endif |
#ifdef CV_HAAR_USE_AVX |
bool haveAVX = false; |
if(cv::checkHardwareSupport(CV_CPU_AVX)) |
if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers
{ |
haveAVX = true; |
} |
#else |
#ifdef CV_HAAR_USE_SSE |
bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); |
#endif |
#endif |
int p_offset, pq_offset; |
int i, j; |
@ -828,17 +828,17 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, |
{ |
stage_sum = 0.0; |
#ifdef CV_HAAR_USE_AVX |
if(haveAVX) |
{ |
for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) |
{ |
stage_sum += icvEvalHidHaarClassifierAVX( |
variance_norm_factor, p_offset ); |
} |
} |
#endif |
#ifdef CV_HAAR_USE_AVX |
if(haveAVX) |
{ |
for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) |
{ |
stage_sum += icvEvalHidHaarClassifierAVX( |
cascade->stage_classifier[i].classifier+j, |
variance_norm_factor, p_offset ); |
} |
} |
#endif |
for( j = 0; j < ptr->count; j++ ) |
{ |
stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset ); |
@ -859,283 +859,283 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, |
} |
else if( cascade->isStumpBased ) |
{ |
#ifdef CV_HAAR_USE_AVX |
if(haveAVX) |
{ |
CvHidHaarClassifier* classifiers[8]; |
CvHidHaarTreeNode* nodes[8]; |
for( i = start_stage; i < cascade->count; i++ ) |
{ |
stage_sum = 0.0; |
int j = 0; |
float CV_DECL_ALIGNED(32) buf[8]; |
if( cascade->stage_classifier[i].two_rects ) |
{ |
for( ; j <= cascade->stage_classifier[i].count-8; j+=8 ) |
{ |
//__m256 stage_sumPart = _mm256_setzero_ps();
classifiers[0] = cascade->stage_classifier[i].classifier + j; |
nodes[0] = classifiers[0]->node; |
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; |
nodes[1] = classifiers[1]->node; |
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; |
nodes[2]= classifiers[2]->node; |
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; |
nodes[3] = classifiers[3]->node; |
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; |
nodes[4] = classifiers[4]->node; |
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; |
nodes[5] = classifiers[5]->node; |
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; |
nodes[6] = classifiers[6]->node; |
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; |
nodes[7] = classifiers[7]->node; |
__m256 t = _mm256_set1_ps(variance_norm_factor); |
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); |
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), |
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], |
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
__m256 sum = _mm256_mul_ps(offset, weight); |
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), |
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), |
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); |
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], |
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); |
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], |
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); |
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ))); |
stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); |
} |
for( ; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
else |
{ |
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 ) |
{ |
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; |
classifiers[0] = cascade->stage_classifier[i].classifier + j; |
nodes[0] = classifiers[0]->node; |
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; |
nodes[1] = classifiers[1]->node; |
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; |
nodes[2]= classifiers[2]->node; |
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; |
nodes[3] = classifiers[3]->node; |
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; |
nodes[4] = classifiers[4]->node; |
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; |
nodes[5] = classifiers[5]->node; |
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; |
nodes[6] = classifiers[6]->node; |
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; |
nodes[7] = classifiers[7]->node; |
__m256 t = _mm256_set1_ps(variance_norm_factor); |
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); |
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), |
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], |
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
__m256 sum = _mm256_mul_ps(offset, weight); |
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), |
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), |
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); |
if( nodes[0]->feature.rect[2].p0 ) |
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; |
if( nodes[1]->feature.rect[2].p0 ) |
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; |
if( nodes[2]->feature.rect[2].p0 ) |
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; |
if( nodes[3]->feature.rect[2].p0 ) |
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; |
if( nodes[4]->feature.rect[2].p0 ) |
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; |
if( nodes[5]->feature.rect[2].p0 ) |
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; |
if( nodes[6]->feature.rect[2].p0 ) |
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; |
if( nodes[7]->feature.rect[2].p0 ) |
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; |
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); |
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], |
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); |
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], |
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); |
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )); |
outBuf = _mm256_hadd_ps(outBuf, outBuf); |
outBuf = _mm256_hadd_ps(outBuf, outBuf); |
_mm256_store_ps(buf, outBuf); |
} |
for( ; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
if( stage_sum < cascade->stage_classifier[i].threshold ) |
return -i; |
} |
} |
else |
#endif |
#ifdef CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization
{ |
for( i = start_stage; i < cascade->count; i++ ) |
{ |
__m128d stage_sum = _mm_setzero_pd(); |
if( cascade->stage_classifier[i].two_rects ) |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); |
__m128d a = _mm_set_sd(classifier->alpha[0]); |
__m128d b = _mm_set_sd(classifier->alpha[1]); |
__m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + |
calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); |
t = _mm_cmpgt_sd(t, sum); |
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); |
} |
} |
else |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); |
__m128d a = _mm_set_sd(classifier->alpha[0]); |
__m128d b = _mm_set_sd(classifier->alpha[1]); |
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
__m128d sum = _mm_set_sd(_sum); |
t = _mm_cmpgt_sd(t, sum); |
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); |
} |
} |
__m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold); |
if( _mm_comilt_sd(stage_sum, i_threshold) ) |
return -i; |
} |
} |
#endif |
{ |
for( i = start_stage; i < cascade->count; i++ ) |
{ |
stage_sum = 0.0; |
if( cascade->stage_classifier[i].two_rects ) |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
else |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
if( stage_sum < cascade->stage_classifier[i].threshold ) |
return -i; |
} |
} |
} |
else |
#ifdef CV_HAAR_USE_AVX |
if(haveAVX) |
{ |
CvHidHaarClassifier* classifiers[8]; |
CvHidHaarTreeNode* nodes[8]; |
for( i = start_stage; i < cascade->count; i++ ) |
{ |
stage_sum = 0.0; |
int j = 0; |
float CV_DECL_ALIGNED(32) buf[8]; |
if( cascade->stage_classifier[i].two_rects ) |
{ |
for( ; j <= cascade->stage_classifier[i].count-8; j+=8 ) |
{ |
//__m256 stage_sumPart = _mm256_setzero_ps();
classifiers[0] = cascade->stage_classifier[i].classifier + j; |
nodes[0] = classifiers[0]->node; |
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; |
nodes[1] = classifiers[1]->node; |
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; |
nodes[2]= classifiers[2]->node; |
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; |
nodes[3] = classifiers[3]->node; |
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; |
nodes[4] = classifiers[4]->node; |
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; |
nodes[5] = classifiers[5]->node; |
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; |
nodes[6] = classifiers[6]->node; |
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; |
nodes[7] = classifiers[7]->node; |
__m256 t = _mm256_set1_ps(variance_norm_factor); |
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); |
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), |
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], |
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); |
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, |
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); |
__m256 sum = _mm256_mul_ps(offset, weight); |
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), |
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), |
calc_sum(nodes[0]->feature.rect[1],p_offset)); |
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, |
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); |
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); |
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], |
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); |
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], |
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); |
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ))); |
stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); |
} |
for( ; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
else |
{ |
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 ) |
{ |
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; |
classifiers[0] = cascade->stage_classifier[i].classifier + j; |
nodes[0] = classifiers[0]->node; |
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; |
nodes[1] = classifiers[1]->node; |
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; |
nodes[2]= classifiers[2]->node; |
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; |
nodes[3] = classifiers[3]->node; |
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; |
nodes[4] = classifiers[4]->node; |
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; |
nodes[5] = classifiers[5]->node; |
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; |
nodes[6] = classifiers[6]->node; |
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; |
nodes[7] = classifiers[7]->node; |
__m256 t = _mm256_set1_ps(variance_norm_factor); |
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); |
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), |
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], |
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); |
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, |
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); |
__m256 sum = _mm256_mul_ps(offset, weight); |
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), |
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), |
calc_sum(nodes[0]->feature.rect[1],p_offset)); |
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, |
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); |
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); |
if( nodes[0]->feature.rect[2].p0 ) |
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; |
if( nodes[1]->feature.rect[2].p0 ) |
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; |
if( nodes[2]->feature.rect[2].p0 ) |
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; |
if( nodes[3]->feature.rect[2].p0 ) |
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; |
if( nodes[4]->feature.rect[2].p0 ) |
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; |
if( nodes[5]->feature.rect[2].p0 ) |
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; |
if( nodes[6]->feature.rect[2].p0 ) |
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; |
if( nodes[7]->feature.rect[2].p0 ) |
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; |
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); |
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], |
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); |
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], |
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); |
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )); |
outBuf = _mm256_hadd_ps(outBuf, outBuf); |
outBuf = _mm256_hadd_ps(outBuf, outBuf); |
_mm256_store_ps(buf, outBuf); |
} |
for( ; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
if( stage_sum < cascade->stage_classifier[i].threshold ) |
return -i; |
} |
} |
else |
#endif |
#if defined CV_HAAR_USE_SSE && CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization
if(haveSSE2) |
{ |
for( i = start_stage; i < cascade->count; i++ ) |
{ |
__m128d stage_sum = _mm_setzero_pd(); |
if( cascade->stage_classifier[i].two_rects ) |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
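// ayasin - NHM (Nehalem) perf optimization: pick alpha[0]/alpha[1] with a compare mask + blend
// instead of a costly, hard-to-predict conditional jump (jcc).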
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); |
__m128d a = _mm_set_sd(classifier->alpha[0]); |
__m128d b = _mm_set_sd(classifier->alpha[1]); |
__m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + |
calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); |
t = _mm_cmpgt_sd(t, sum); |
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); |
} |
} |
else |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
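// ayasin - NHM (Nehalem) perf optimization: pick alpha[0]/alpha[1] with a compare mask + blend
// instead of a costly, hard-to-predict conditional jump (jcc).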
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); |
__m128d a = _mm_set_sd(classifier->alpha[0]); |
__m128d b = _mm_set_sd(classifier->alpha[1]); |
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
__m128d sum = _mm_set_sd(_sum); |
t = _mm_cmpgt_sd(t, sum); |
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); |
} |
} |
__m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold); |
if( _mm_comilt_sd(stage_sum, i_threshold) ) |
return -i; |
} |
} |
else |
#endif |
{ |
for( i = start_stage; i < cascade->count; i++ ) |
{ |
stage_sum = 0.0; |
if( cascade->stage_classifier[i].two_rects ) |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
else |
{ |
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) |
{ |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; |
CvHidHaarTreeNode* node = classifier->node; |
double t = node->threshold*variance_norm_factor; |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; |
if( node->feature.rect[2].p0 ) |
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; |
stage_sum += classifier->alpha[sum >= t]; |
} |
} |
if( stage_sum < cascade->stage_classifier[i].threshold ) |
return -i; |
} |
} |
} |
else |
{ |
for( i = start_stage; i < cascade->count; i++ ) |
{ |
stage_sum = 0.0; |
int j = 0; |
#ifdef CV_HAAR_USE_AVX |
if(haveAVX) |
{ |
for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) |
{ |
stage_sum += icvEvalHidHaarClassifierAVX( |
variance_norm_factor, p_offset ); |
} |
} |
#endif |
for(; j < cascade->stage_classifier[i].count; j++ ) |
{ |
stage_sum += icvEvalHidHaarClassifier( |
cascade->stage_classifier[i].classifier + j, |
variance_norm_factor, p_offset ); |
} |
int k = 0; |
#ifdef CV_HAAR_USE_AVX |
if(haveAVX) |
{ |
for( ; k < cascade->stage_classifier[i].count-8; k+=8 ) |
{ |
stage_sum += icvEvalHidHaarClassifierAVX( |
cascade->stage_classifier[i].classifier+k, |
variance_norm_factor, p_offset ); |
} |
} |
#endif |
for(; k < cascade->stage_classifier[i].count; k++ ) |
{ |
stage_sum += icvEvalHidHaarClassifier( |
cascade->stage_classifier[i].classifier + k, |
variance_norm_factor, p_offset ); |
} |
if( stage_sum < cascade->stage_classifier[i].threshold ) |
return -i; |
} |
} |
return 1; |
} |