Merge pull request #13317 from terfendail:norm_wintr

* Added performance tests for hal::norm functions

* Added sum of absolute differences intrinsic

* norm implementation updated to use wide universal intrinsics

* Improve and fix v_reduce_sad on VSX
Authored by Vitaly Tuzov 6 years ago; committed by Alexander Alekhin.
commit 00c9ab8c23 (parent ccf96b9e05)
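
For orientation, a minimal sketch of how the new v_reduce_sad universal intrinsic is meant to be used (illustrative only, not part of the patch; assumes an OpenCV build with 128-bit SIMD available):

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdlib>
    using namespace cv;

    // SAD of two uchar buffers: 16 lanes per iteration, scalar tail for the rest
    unsigned sad_u8(const uchar* a, const uchar* b, int n)
    {
        int j = 0;
        unsigned s = 0;
        for (; j <= n - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
            s += v_reduce_sad(v_load(a + j), v_load(b + j)); // intrinsic added by this PR
        for (; j < n; j++)
            s += (unsigned)std::abs(a[j] - b[j]);
        return s;
    }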
8 changed files (lines changed):

    35  modules/core/include/opencv2/core/hal/intrin_avx.hpp
    15  modules/core/include/opencv2/core/hal/intrin_cpp.hpp
    43  modules/core/include/opencv2/core/hal/intrin_neon.hpp
    35  modules/core/include/opencv2/core/hal/intrin_sse.hpp
    44  modules/core/include/opencv2/core/hal/intrin_vsx.hpp
    49  modules/core/perf/perf_norm.cpp
   118  modules/core/src/norm.cpp
     2  modules/ts/include/opencv2/ts.hpp

modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1133,6 +1133,41 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
     return v_float32x8(_mm256_hadd_ps(ab, cd));
 }

+inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
+{
+    return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val));
+}
+inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
+{
+    __m256i half = _mm256_set1_epi8(0x7f);
+    return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half)));
+}
+inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
+{
+    v_uint32x8 l, h;
+    v_expand(v_add_wrap(a - b, b - a), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
+{
+    v_uint32x8 l, h;
+    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
+{
+    return v_reduce_sum(v_max(a, b) - v_min(a, b));
+}
+inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
+{
+    v_int32x8 m = a < b;
+    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
+}
+inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
+{
+    return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+}
+
 /** Popcount **/
 #define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
 inline v_uint32x8 v_popcount(const _Tpvec& a) \
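
Two branch-free idioms above are worth spelling out. On unsigned 16-bit lanes, operator- saturates at zero, so in v_add_wrap(a - b, b - a) one of the two differences saturates to zero and the wrap-add yields |a - b|. On signed 32-bit lanes, m = a < b is an all-ones mask where the comparison holds, and ((a - b) ^ m) - m is the classic two's-complement absolute value:

    // with x = a - b and m = (a < b) ? ~0 : 0 per lane:
    //   m == 0  : (x ^ 0) - 0  == x             (x already non-negative)
    //   m == ~0 : (x ^ ~0) + 1 == ~x + 1 == -x  (two's-complement negation)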

modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -1063,6 +1063,21 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     return r;
 }

+/** @brief Sum absolute differences of values
+
+Scheme:
+@code
+{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...}
+@endcode
+For all types except 64-bit types.*/
+template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
+    for (int i = 1; i < n; i++)
+        c += _absdiff(a.s[i], b.s[i]);
+    return c;
+}
+
 /** @brief Get negative values mask

 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
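
Concretely, for a hypothetical 4-lane register:

    // v_reduce_sad({1, 5, 3, 8}, {4, 2, 8, 0})
    //   = |1-4| + |5-2| + |3-8| + |8-0|
    //   = 3 + 3 + 5 + 8 = 19

The return type is the sum type of the element's absolute-value type, so for example signed 16-bit inputs reduce to an unsigned result.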

modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -999,6 +999,49 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     return v_float32x4(vaddq_f32(v0, v1));
 }

+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    uint32x4_t t0 = vabdq_u32(a.val, b.val);
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    float32x4_t t0 = vabdq_f32(a.val, b.val);
+    float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
+    return vget_lane_f32(vpadd_f32(t1, t1), 0);
+}
+
 #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
 inline v_uint32x4 v_popcount(const _Tpvec& a) \
 { \
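
All seven NEON overloads follow the same widen-and-fold ladder; traced for the u8 case (a restatement of the code above, not new logic):

    // vabdq_u8    : 16 x u8 lane-wise |a_i - b_i|
    // vpaddlq_u8  : pairwise add, widening to 8 x u16
    // vpaddlq_u16 : pairwise add, widening to 4 x u32
    // vpadd_u32   : fold low/high halves to 2 x u32, then once more,
    //               and vget_lane_u32(..., 0) extracts the scalar sum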

modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1477,6 +1477,41 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)

+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val));
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    __m128i half = _mm_set1_epi8(0x7f);
+    return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half),
+                                                    _mm_add_epi8(b.val, half)));
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint32x4 l, h;
+    v_expand(v_absdiff(a, b), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    v_uint32x4 l, h;
+    v_expand(v_absdiff(a, b), l, h);
+    return v_reduce_sum(l + h);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b));
+}
+
 #define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
 inline v_uint32x4 v_popcount(const _Tpvec& a) \
 { \
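
A note on the int8 overload (same trick as the AVX variant above): x86 provides only an unsigned byte SAD (_mm_sad_epu8), so both signed inputs are shifted by the same constant first, and the shared offset cancels inside each lane difference:

    // lane-wise, in 8-bit wraparound arithmetic:
    //   ua = a + 0x7f,  ub = b + 0x7f   same offset applied to both operands
    //   ua - ub == a - b (mod 256)      the offset cancels
    //   _mm_sad_epu8(ua, ub)            unsigned SAD of the biased values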

modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -739,6 +739,50 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     return v_float32x4(vec_mergeh(ac, bd));
 }

+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const vec_uint4 zero4 = vec_uint4_z;
+    vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
+    return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    const vec_int4 zero4 = vec_int4_z;
+    vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
+    vec_int4 sum4 = vec_sum4s(ad, zero4);
+    return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vec_ushort8 ad = vec_absd(a.val, b.val);
+    VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad)));
+    return (unsigned)vec_extract(sum, 3);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    const vec_int4 zero4 = vec_int4_z;
+    vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
+    vec_int4 sum4 = vec_sum4s(ad, zero4);
+    return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const vec_uint4 ad = vec_absd(a.val, b.val);
+    const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
+    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
+    return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
+    const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
+    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
+}
+
 /** Popcount **/
 template<typename _Tpvec>
 inline v_uint32x4 v_popcount(const _Tpvec& a)
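
The VSX overloads lean on the hardware cross-lane sums rather than pairwise folds (a reading of the code above, not new logic):

    // vec_absd / vec_abss(vec_subs(...)) : lane-wise absolute differences
    // vec_sum4s                          : adds each group of four byte lanes
    //                                      into a 32-bit partial sum
    // vec_sums                           : folds a vec_int4 into its last
    //                                      element, hence vec_extract(..., 3)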

modules/core/perf/perf_norm.cpp
@@ -253,4 +253,53 @@ PERF_TEST_P( Size_MatType, normalize_minmax, TYPICAL_MATS )
     SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
 }

+typedef TestBaseWithParam< int > test_len;
+
+PERF_TEST_P(test_len, hal_normL1_u8,
+            testing::Values(300000, 2000000)
+           )
+{
+    int len = GetParam();
+
+    Mat src1(1, len, CV_8UC1);
+    Mat src2(1, len, CV_8UC1);
+    declare.in(src1, src2, WARMUP_RNG);
+
+    double n;
+    TEST_CYCLE() n = hal::normL1_(src1.ptr<uchar>(0), src2.ptr<uchar>(0), len);
+    CV_UNUSED(n);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(test_len, hal_normL1_f32,
+            testing::Values(300000, 2000000)
+           )
+{
+    int len = GetParam();
+
+    Mat src1(1, len, CV_32FC1);
+    Mat src2(1, len, CV_32FC1);
+    declare.in(src1, src2, WARMUP_RNG);
+
+    double n;
+    TEST_CYCLE() n = hal::normL1_(src1.ptr<float>(0), src2.ptr<float>(0), len);
+    CV_UNUSED(n);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(test_len, hal_normL2Sqr,
+            testing::Values(300000, 2000000)
+           )
+{
+    int len = GetParam();
+
+    Mat src1(1, len, CV_32FC1);
+    Mat src2(1, len, CV_32FC1);
+    declare.in(src1, src2, WARMUP_RNG);
+
+    double n;
+    TEST_CYCLE() n = hal::normL2Sqr_(src1.ptr<float>(0), src2.ptr<float>(0), len);
+    CV_UNUSED(n);
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
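
The new cases run through the usual perf harness; binary name and Google-Test-style filter below assume a standard OpenCV build layout:

    ./bin/opencv_perf_core --gtest_filter='*hal_normL1_u8*'
    ./bin/opencv_perf_core --gtest_filter='*hal_normL2Sqr*'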

modules/core/src/norm.cpp
@@ -98,43 +98,15 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 float normL2Sqr_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_AVX2
-    float CV_DECL_ALIGNED(32) buf[8];
-    __m256 d0 = _mm256_setzero_ps();
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j));
-#if CV_FMA3
-        d0 = _mm256_fmadd_ps(t0, t0, d0);
-#else
-        d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0));
-#endif
-    }
-    _mm256_store_ps(buf, d0);
-    d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
-#elif CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-    for( ; j <= n - 8; j += 8 )
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
     {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
+        v_float32 t = vx_load(a + j) - vx_load(b + j);
+        v_d = v_muladd(t, t, v_d);
     }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+    d = v_reduce_sum(v_d);
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
@@ -147,38 +119,12 @@ float normL2Sqr_(const float* a, const float* b, int n)
 float normL1_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-    __m128 absmask = _mm_load_ps((const float*)absbuf);
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
-        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#elif CV_NEON
-    float32x4_t v_sum = vdupq_n_f32(0.0f);
-    for ( ; j <= n - 4; j += 4)
-        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
-    float CV_DECL_ALIGNED(16) buf[4];
-    vst1q_f32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
+        v_d += v_absdiff(vx_load(a + j), vx_load(b + j));
+    d = v_reduce_sum(v_d);
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -187,46 +133,10 @@ float normL1_(const float* a, const float* b, int n)
 int normL1_(const uchar* a, const uchar* b, int n)
 {
     int j = 0, d = 0;
-#if CV_SSE
-    __m128i d0 = _mm_setzero_si128();
-
-    for( ; j <= n - 16; j += 16 )
-    {
-        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
-        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-    for( ; j <= n - 4; j += 4 )
-    {
-        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
-        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
-#elif CV_NEON
-    uint32x4_t v_sum = vdupq_n_u32(0.0f);
-    for ( ; j <= n - 16; j += 16)
-    {
-        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
-        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
-    }
-    uint CV_DECL_ALIGNED(16) buf[4];
-    vst1q_u32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    for (; j <= n - v_uint8::nlanes; j += v_uint8::nlanes)
+        d += v_reduce_sad(vx_load(a + j), vx_load(b + j));
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;

modules/ts/include/opencv2/ts.hpp
@@ -17,6 +17,8 @@
 #include "opencv2/core/utils/trace.hpp"
+#include "opencv2/core/hal/hal.hpp"
+
 #include <stdarg.h> // for va_list
 #include "cvconfig.h"
