@@ -41,6 +41,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencv2/core/openvx/ovx_defs.hpp"
@@ -1938,10 +1939,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
     CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() );
-#if CV_SSE2
-    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-#endif
     for( size_t i = 0; i < it.nplanes; i++, ++it )
     {
         const float* h1 = it.planes[0].ptr<float>();
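Note (illustrative annotation, not part of the patch): the deleted haveSIMD runtime check is no longer needed because the wide universal intrinsics pulled in by opencv2/core/hal/intrin.hpp are selected at compile time via CV_SIMD / CV_SIMD_64F. A minimal sketch of the load/accumulate/reduce pattern the hunks below rely on, assuming the classic fixed-width intrinsics of OpenCV 3.4.x/4.x (helper name is invented for illustration):

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Sum a float array the same way the new compareHist hunks do:
// whole vectors via vx_load, a horizontal v_reduce_sum, then a scalar tail.
static double sumFloats(const float* data, int len)
{
    int j = 0;
    double s = 0.;
#if CV_SIMD
    cv::v_float32 v_s = cv::vx_setzero_f32();
    for ( ; j <= len - cv::v_float32::nlanes; j += cv::v_float32::nlanes)
        v_s += cv::vx_load(data + j);      // one full register per iteration
    s += cv::v_reduce_sum(v_s);            // add the lanes together
#endif
    for ( ; j < len; j++)                  // leftover elements, as in the patch
        s += data[j];
    return s;
}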
@@ -1961,50 +1958,63 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_CORREL )
         {
-#if CV_SSE2
-            if (haveSIMD)
+#if CV_SIMD_64F
+            v_float64 v_s1 = vx_setzero_f64();
+            v_float64 v_s2 = vx_setzero_f64();
+            v_float64 v_s11 = vx_setzero_f64();
+            v_float64 v_s12 = vx_setzero_f64();
+            v_float64 v_s22 = vx_setzero_f64();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
             {
-                __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1;
-                __m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1;
-                for( ; j <= len - 4; j += 4 )
-                {
-                    __m128 v_a = _mm_loadu_ps(h1 + j);
-                    __m128 v_b = _mm_loadu_ps(h2 + j);
-                    // 0-1
-                    __m128d v_ad = _mm_cvtps_pd(v_a);
-                    __m128d v_bd = _mm_cvtps_pd(v_b);
-                    v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
-                    v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
-                    v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-                    // 2-3
-                    v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
-                    v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
-                    v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
-                    v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
-                    v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-                }
-                double CV_DECL_ALIGNED(16) ar[10];
-                _mm_store_pd(ar, v_s12);
-                _mm_store_pd(ar + 2, v_s11);
-                _mm_store_pd(ar + 4, v_s22);
-                _mm_store_pd(ar + 6, v_s1);
-                _mm_store_pd(ar + 8, v_s2);
-                s12 += ar[0] + ar[1];
-                s11 += ar[2] + ar[3];
-                s22 += ar[4] + ar[5];
-                s1 += ar[6] + ar[7];
-                s2 += ar[8] + ar[9];
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+                // 0-1
+                v_float64 v_ad = v_cvt_f64(v_a);
+                v_float64 v_bd = v_cvt_f64(v_b);
+                v_s12 = v_muladd(v_ad, v_bd, v_s12);
+                v_s11 = v_muladd(v_ad, v_ad, v_s11);
+                v_s22 = v_muladd(v_bd, v_bd, v_s22);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
+                // 2-3
+                v_ad = v_cvt_f64_high(v_a);
+                v_bd = v_cvt_f64_high(v_b);
+                v_s12 = v_muladd(v_ad, v_bd, v_s12);
+                v_s11 = v_muladd(v_ad, v_ad, v_s11);
+                v_s22 = v_muladd(v_bd, v_bd, v_s22);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
             }
-#endif
+            s12 += v_reduce_sum(v_s12);
+            s11 += v_reduce_sum(v_s11);
+            s22 += v_reduce_sum(v_s22);
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+#elif CV_SIMD && 0 //Disable vectorization for CV_COMP_CORREL if f64 is unsupported due to low precision
+            v_float32 v_s1 = vx_setzero_f32();
+            v_float32 v_s2 = vx_setzero_f32();
+            v_float32 v_s11 = vx_setzero_f32();
+            v_float32 v_s12 = vx_setzero_f32();
+            v_float32 v_s22 = vx_setzero_f32();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+                v_s12 = v_muladd(v_a, v_b, v_s12);
+                v_s11 = v_muladd(v_a, v_a, v_s11);
+                v_s22 = v_muladd(v_b, v_b, v_s22);
+                v_s1 += v_a;
+                v_s2 += v_b;
+            }
+            s12 += v_reduce_sum(v_s12);
+            s11 += v_reduce_sum(v_s11);
+            s22 += v_reduce_sum(v_s22);
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+#endif
             for( ; j < len; j++ )
             {
                 double a = h1[j];
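Note (illustrative annotation, not part of the patch): the five accumulators s1, s2, s11, s22, s12 are combined into the Pearson correlation only after the plane loop, which is why CV_COMP_CORREL keeps f64 accumulation and the float32-only branch above is compiled out with && 0: the final step subtracts nearly equal large sums. A hedged sketch of that combination step, with N the total number of bins (helper name and signature are invented for illustration, not taken from histogram.cpp):

#include <cmath>
#include <cfloat>

// HISTCMP_CORREL from the accumulated sums:
// s1 = sum(a), s2 = sum(b), s11 = sum(a*a), s22 = sum(b*b), s12 = sum(a*b).
static double correlFromSums(double s1, double s2, double s11, double s22, double s12, double N)
{
    double scale = 1. / N;
    double num = s12 - s1 * s2 * scale;                                // covariance term
    double denom2 = (s11 - s1 * s1 * scale) * (s22 - s2 * s2 * scale); // variance product
    return std::abs(denom2) > DBL_EPSILON ? num / std::sqrt(denom2) : 1.;
}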
@@ -2019,67 +2029,68 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_INTERSECT )
         {
-#if CV_NEON
-            float32x4_t v_result = vdupq_n_f32(0.0f);
-            for( ; j <= len - 4; j += 4 )
-                v_result = vaddq_f32(v_result, vminq_f32(vld1q_f32(h1 + j), vld1q_f32(h2 + j)));
-            float CV_DECL_ALIGNED(16) ar[4];
-            vst1q_f32(ar, v_result);
-            result += ar[0] + ar[1] + ar[2] + ar[3];
-#elif CV_SSE2
-            if (haveSIMD)
+#if CV_SIMD_64F
+            v_float64 v_result = vx_setzero_f64();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
             {
-                __m128d v_result = _mm_setzero_pd();
-                for( ; j <= len - 4; j += 4 )
-                {
-                    __m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j),
-                                              _mm_loadu_ps(h2 + j));
-                    v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
-                    v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
-                    v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
-                }
-                double CV_DECL_ALIGNED(16) ar[2];
-                _mm_store_pd(ar, v_result);
-                result += ar[0] + ar[1];
+                v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
+                v_result += v_cvt_f64(v_src) + v_cvt_f64_high(v_src);
+            }
+            result += v_reduce_sum(v_result);
+#elif CV_SIMD
+            v_float32 v_result = vx_setzero_f32();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
+                v_result += v_src;
             }
-#endif
+            result += v_reduce_sum(v_result);
+#endif
             for( ; j < len; j++ )
                 result += std::min(h1[j], h2[j]);
         }
         else if( method == CV_COMP_BHATTACHARYYA )
         {
-#if CV_SSE2
-            if (haveSIMD)
+#if CV_SIMD_64F
+            v_float64 v_s1 = vx_setzero_f64();
+            v_float64 v_s2 = vx_setzero_f64();
+            v_float64 v_result = vx_setzero_f64();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
             {
-                __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1;
-                for( ; j <= len - 4; j += 4 )
-                {
-                    __m128 v_a = _mm_loadu_ps(h1 + j);
-                    __m128 v_b = _mm_loadu_ps(h2 + j);
-                    __m128d v_ad = _mm_cvtps_pd(v_a);
-                    __m128d v_bd = _mm_cvtps_pd(v_b);
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-                    v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
-                    v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
-                    v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-                    v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
-                }
-                double CV_DECL_ALIGNED(16) ar[6];
-                _mm_store_pd(ar, v_s1);
-                _mm_store_pd(ar + 2, v_s2);
-                _mm_store_pd(ar + 4, v_result);
-                s1 += ar[0] + ar[1];
-                s2 += ar[2] + ar[3];
-                result += ar[4] + ar[5];
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+                v_float64 v_ad = v_cvt_f64(v_a);
+                v_float64 v_bd = v_cvt_f64(v_b);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
+                v_result += v_sqrt(v_ad * v_bd);
+                v_ad = v_cvt_f64_high(v_a);
+                v_bd = v_cvt_f64_high(v_b);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
+                v_result += v_sqrt(v_ad * v_bd);
             }
-#endif
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+            result += v_reduce_sum(v_result);
+#elif CV_SIMD && 0 //Disable vectorization for CV_COMP_BHATTACHARYYA if f64 is unsupported due to low precision
+            v_float32 v_s1 = vx_setzero_f32();
+            v_float32 v_s2 = vx_setzero_f32();
+            v_float32 v_result = vx_setzero_f32();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+                v_s1 += v_a;
+                v_s2 += v_b;
+                v_result += v_sqrt(v_a * v_b);
+            }
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+            result += v_reduce_sum(v_result);
+#endif
             for( ; j < len; j++ )
             {
                 double a = h1[j];
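Note (illustrative annotation, not part of the patch): as in the CV_COMP_CORREL case, the Bhattacharyya accumulators (s1, s2 and the running sum of sqrt(a*b) held in result) only become the final distance after the plane loop, following the documented HISTCMP_BHATTACHARYYA formula d = sqrt(1 - sum(sqrt(a*b)) / sqrt(s1*s2)). A hedged sketch of that last step (helper name is invented for illustration):

#include <algorithm>
#include <cmath>
#include <cfloat>

// s1 = sum(a), s2 = sum(b), sumSqrtProd = sum over bins of sqrt(a*b).
static double bhattacharyyaFromSums(double s1, double s2, double sumSqrtProd)
{
    double norm = std::sqrt(s1 * s2);                       // normalization over both histograms
    double ratio = norm > DBL_EPSILON ? sumSqrtProd / norm : 1.;
    return std::sqrt(std::max(1. - ratio, 0.));             // clamp against rounding error
}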