diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index db13a0027b..630c64c949 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -2,6 +2,7 @@ set(the_description "The Core Functionality") ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) ocv_add_dispatched_file(stat SSE4_2 AVX2) +ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2) # dispatching for accuracy tests ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 0cf36cf174..78bd14e4d8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -661,7 +661,7 @@ inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); - return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); + return v_pack(c, d); } inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) { @@ -1291,6 +1291,16 @@ inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) { return v_abs(a - b); } +/** Saturating absolute difference **/ +inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = a - b; + v_int8x32 m = a < b; + return (d ^ m) - m; +} +inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) +{ return v_max(a, b) - v_min(a, b); } + ////////// Conversions ///////// /** Rounding **/ @@ -1300,6 +1310,12 @@ inline v_int32x8 v_round(const v_float32x8& a) inline v_int32x8 v_round(const v_float64x4& a) { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); } +inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b) +{ + __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val); + return v_int32x8(_v256_combine(ai, bi)); +} + inline v_int32x8 v_trunc(const v_float32x8& a) { return v_int32x8(_mm256_cvttps_epi32(a.val)); } @@ -1689,6 +1705,40 @@ void v_rshr_pack_store(int* ptr, const v_int64x4& a) v_pack_store(ptr, (a + delta) >> n); } +// pack boolean +inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i ab = _mm256_packs_epi16(a.val, b.val); + return v_uint8x32(_v256_shuffle_odd_64(ab)); +} + +inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b, + const v_uint32x8& c, const v_uint32x8& d) +{ + __m256i ab = _mm256_packs_epi32(a.val, b.val); + __m256i cd = _mm256_packs_epi32(c.val, d.val); + + __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd)); + return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0))); +} + +inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, + const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f, + const v_uint64x4& g, const v_uint64x4& h) +{ + __m256i ab = _mm256_packs_epi32(a.val, b.val); + __m256i cd = _mm256_packs_epi32(c.val, d.val); + __m256i ef = _mm256_packs_epi32(e.val, f.val); + __m256i gh = _mm256_packs_epi32(g.val, h.val); + + __m256i abcd = _mm256_packs_epi32(ab, cd); + __m256i efgh = _mm256_packs_epi32(ef, gh); + __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh)); + + __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8); + return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev)); +} + /* Recombine */ // its up there with load and store operations diff --git 
a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 38a39172d0..5712f167a8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -109,7 +109,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high -- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, +- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u, @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high - Extract: @ref v_extract @@ -159,7 +159,7 @@ Most of these operations return only one value. ### Other math - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude -- Absolute values: @ref v_abs, @ref v_absdiff +- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs ### Conversions @@ -199,10 +199,12 @@ Regular integers: |logical | x | x | x | x | x | x | |min, max | x | x | x | x | x | x | |absdiff | x | x | x | x | x | x | +|absdiffs | | x | | x | | | |reduce | | | | | x | x | |mask | x | x | x | x | x | x | |pack | x | x | x | x | x | x | |pack_u | x | | x | | | | +|pack_b | x | | | | | | |unpack | x | x | x | x | x | x | |extract | x | x | x | x | x | x | |rotate (lanes) | x | x | x | x | x | x | @@ -762,6 +764,19 @@ inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) return c; } +/** @brief Saturating absolute difference + +Returns \f$ saturate(|a - b|) \f$ . +For 8-, 16-bit signed integer source types. */ +template +inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++) + c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i])); + return c; +} + /** @brief Inversed square root Returns \f$ 1/sqrt(a) \f$ @@ -1613,6 +1628,18 @@ template inline v_reg v_round(const v_reg& a) return c; } +/** @overload */ +template inline v_reg v_round(const v_reg& a, const v_reg& b) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvRound(a.s[i]); + c.s[i+n] = cvRound(b.s[i]); + } + return c; +} + /** @brief Floor Floor each value. Input type is float vector ==> output type is int vector.*/ @@ -2059,6 +2086,103 @@ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, s OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) //! @} +//! @cond IGNORED +template +inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + for (int i = 0; i < n; ++i) + { + mptr[i] = (_Tpm)a.s[i]; + mptr[i + n] = (_Tpm)b.s[i]; + } +} +//! @endcond + +//! @name Pack boolean values +//! @{ +//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector +//! +//! @note Must provide valid boolean values to guarantee same result for all architectures. + +/** @brief +//! 
For 16-bit boolean values + +Scheme: +@code +a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0} +b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF} +=============== +{ + 0xFF 0 0 0xFF 0 0xFF 0xFF 0 + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ + +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + return mask; +} + +/** @overload +For 32-bit boolean values + +Scheme: +@code +a {0xFFFF.. 0 0 0xFFFF..} +b {0 0xFFFF.. 0xFFFF.. 0} +c {0xFFFF.. 0 0xFFFF.. 0} +d {0 0xFFFF.. 0 0xFFFF..} +=============== +{ + 0xFF 0 0 0xFF 0 0xFF 0xFF 0 + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 8, c, d); + return mask; +} + +/** @overload +For 64-bit boolean values + +Scheme: +@code +a {0xFFFF.. 0} +b {0 0xFFFF..} +c {0xFFFF.. 0} +d {0 0xFFFF..} + +e {0xFFFF.. 0} +f {0xFFFF.. 0} +g {0 0xFFFF..} +h {0 0xFFFF..} +=============== +{ + 0xFF 0 0 0xFF 0xFF 0 0 0xFF + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 4, c, d); + _pack_b(mask.s + 8, e, f); + _pack_b(mask.s + 12, g, h); + return mask; +} +//! @} + /** @brief Matrix multiplication Scheme: diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 8c13ad52db..50c9b154ee 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -394,6 +394,35 @@ OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16) OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32) +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val)); + return v_uint8x16(ab); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val)); + uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val)); + return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd))); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val)); + uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val)); + uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val)); + uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val)); + + uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd)); + uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh)); + return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh))); +} + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -748,7 +777,6 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8) 
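// [Editor's illustrative sketch, not part of this patch.]
// The new v_pack_b intrinsic collapses per-lane boolean masks (all-ones /
// all-zeros lanes, e.g. comparison results) from wider lanes into a single
// unsigned 8-bit vector, and the new v_absdiffs computes |a - b| with
// saturation for signed types. A minimal, hypothetical helper built only on
// universal intrinsics (the function name and parameters are made up for
// illustration; only v_load, v_setall_s16, v_absdiffs, v_pack_b,
// v_reinterpret_as_u16 and v_store from the API are assumed):
static inline void example_absdiff_mask16(const short* a, const short* b,
                                          short thresh, uchar* mask)
{
    // saturating |a - b| on two consecutive blocks of eight 16-bit lanes
    v_int16x8 d0 = v_absdiffs(v_load(a), v_load(b));
    v_int16x8 d1 = v_absdiffs(v_load(a + 8), v_load(b + 8));
    v_int16x8 t  = v_setall_s16(thresh);
    // each comparison yields a valid 16-bit boolean mask;
    // v_pack_b narrows the two masks into one v_uint8x16
    v_uint8x16 m = v_pack_b(v_reinterpret_as_u16(d0 > t),
                            v_reinterpret_as_u16(d1 > t));
    v_store(mask, m);
}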
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16) -// TODO: absdiff for signed integers OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) @@ -757,6 +785,12 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64) #endif +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); } + #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1242,6 +1276,11 @@ inline v_int32x4 v_round(const v_float64x2& a) return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val)))); +} + inline v_int32x4 v_floor(const v_float64x2& a) { static const int32x2_t zero = vdup_n_s32(0); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index d4740b72fe..c49d0de377 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -634,6 +634,35 @@ void v_rshr_pack_store(int* ptr, const v_int64x2& a) _mm_storel_epi64((__m128i*)ptr, a2); } +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i ab = _mm_packs_epi16(a.val, b.val); + return v_uint8x16(ab); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + __m128i ab = _mm_packs_epi32(a.val, b.val); + __m128i cd = _mm_packs_epi32(c.val, d.val); + return v_uint8x16(_mm_packs_epi16(ab, cd)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + __m128i ab = _mm_packs_epi32(a.val, b.val); + __m128i cd = _mm_packs_epi32(c.val, d.val); + __m128i ef = _mm_packs_epi32(e.val, f.val); + __m128i gh = _mm_packs_epi32(g.val, h.val); + + __m128i abcd = _mm_packs_epi32(ab, cd); + __m128i efgh = _mm_packs_epi32(ef, gh); + return v_uint8x16(_mm_packs_epi16(abcd, efgh)); +} + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -706,19 +735,11 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ { a = a * b; return a; } +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4) -inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b) -{ - v_uint16x8 c, d; - v_mul_expand(a, b, c, d); - return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); -} -inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b) -{ a = a * b; return a; } - // Multiply and expand inline void 
v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, v_uint16x8& c, v_uint16x8& d) @@ -1045,34 +1066,43 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); } -#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ -inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ -} \ -inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - __m128i smask = _mm_set1_epi32(smask32); \ - __m128i a1 = _mm_xor_si128(a.val, smask); \ - __m128i b1 = _mm_xor_si128(b.val, smask); \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ -} - -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) +/** Absolute difference **/ +inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) +{ return v_add_wrap(a - b, b - a); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { - return v_max(a, b) - v_min(a, b); + v_int8x16 d = v_sub_wrap(a, b); + v_int8x16 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ + return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } - inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - __m128i d = _mm_sub_epi32(a.val, b.val); - __m128i m = _mm_cmpgt_epi32(b.val, a.val); - return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); + v_int32x4 d = a - b; + v_int32x4 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); } +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = a - b; + v_int8x16 m = a < b; + return (d ^ m) - m; + } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_max(a, b) - v_min(a, b); } + + inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return a * b + c; @@ -1623,6 +1653,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a) inline v_int32x4 v_round(const v_float64x2& a) { return v_int32x4(_mm_cvtpd_epi32(a.val)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val); + return v_int32x4(_mm_unpacklo_epi64(ai, bi)); +} + inline v_int32x4 v_floor(const v_float64x2& a) { __m128i a1 = _mm_cvtpd_epi32(a.val); diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 27efd2ad9c..b23e19950e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -383,6 +383,35 @@ OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int, //OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long, // vec_sra, vec_packsu, vec_add, pack_u) +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + vec_uchar16 ab = vec_pack(a.val, b.val); + return v_uint8x16(ab); +} + +inline v_uint8x16 
v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + vec_ushort8 ab = vec_pack(a.val, b.val); + vec_ushort8 cd = vec_pack(c.val, d.val); + return v_uint8x16(vec_pack(ab, cd)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + vec_uint4 ab = vec_pack(a.val, b.val); + vec_uint4 cd = vec_pack(c.val, d.val); + vec_uint4 ef = vec_pack(e.val, f.val); + vec_uint4 gh = vec_pack(g.val, h.val); + + vec_ushort8 abcd = vec_pack(ab, cd); + vec_ushort8 efgh = vec_pack(ef, gh); + return v_uint8x16(vec_pack(abcd, efgh)); +} + /* Recombine */ template inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) @@ -834,16 +863,27 @@ inline v_float32x4 v_abs(const v_float32x4& x) inline v_float64x2 v_abs(const v_float64x2& x) { return v_float64x2(vec_abs(x.val)); } +/** Absolute difference **/ +// unsigned OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd) -#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ -inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec2(cast(intrin(a.val, b.val))); } +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) +{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); } +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); } -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd) +inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) +{ return v_abs(a - b); } +inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) +{ return v_abs(a - b); } + +/** Absolute difference for signed integers **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); } ////////// Conversions ///////// @@ -854,6 +894,9 @@ inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_round(const v_float64x2& a) { return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); } + inline v_int32x4 v_floor(const v_float32x4& a) { return v_int32x4(vec_cts(vec_floor(a.val))); } diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 0626607e2f..197b4f0dcf 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2134,1014 +2134,4 @@ cvMaxS( const void* srcarr1, double value, void* dstarr ) cv::max( src1, value, dst ); } - - -namespace cv { namespace hal { - -//======================================= - -#if (ARITHM_USE_IPP == 1) -static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step) -{ - if( height == 1 ) - step1 
= step2 = step = width*elemSize; -} -#define CALL_IPP_BIN_E_12(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_E_21(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_12(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_21(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#else -#define CALL_IPP_BIN_E_12(fun) -#define CALL_IPP_BIN_E_21(fun) -#define CALL_IPP_BIN_12(fun) -#define CALL_IPP_BIN_21(fun) -#endif - - -//======================================= -// Add -//======================================= - -void add8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add8u, cv_hal_add8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_8u_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add8s, cv_hal_add8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -void add16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add16u, cv_hal_add16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_16u_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add16s, cv_hal_add16s, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_16s_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add32s, cv_hal_add32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -void add32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add32f, cv_hal_add32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAdd_32f_C1R) - (vBinOp32, 
IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add64f, cv_hal_add64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Subtract -//======================================= - -void sub8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub8u, cv_hal_sub8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_8u_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub8s, cv_hal_sub8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -void sub16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub16u, cv_hal_sub16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_16u_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub16s, cv_hal_sub16s, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_16s_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub32s, cv_hal_sub32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -void sub32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub32f, cv_hal_sub32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_21(ippiSub_32f_C1R) - (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub64f, cv_hal_sub64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= - -#if (ARITHM_USE_IPP == 1) -#define CALL_IPP_MIN_MAX(fun, type) \ - CV_IPP_CHECK() \ - { \ - type* s1 = (type*)src1; \ - type* s2 = (type*)src2; \ - type* d = dst; \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - int i = 0; \ - for(; i < height; i++) \ - { \ - if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \ - break; \ - s1 = (type*)((uchar*)s1 + step1); \ - s2 = (type*)((uchar*)s2 + step2); \ - d = (type*)((uchar*)d + step); \ - } \ - if (i == height) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } -#else -#define CALL_IPP_MIN_MAX(fun, type) -#endif - -//======================================= -// 
Max -//======================================= - -void max8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max8u, cv_hal_max8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max8s, cv_hal_max8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max16u, cv_hal_max16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max16s, cv_hal_max16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max32s, cv_hal_max32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max32f, cv_hal_max32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float) - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max64f, cv_hal_max64f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double) - vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Min -//======================================= - -void min8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min8u, cv_hal_min8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min8s, cv_hal_min8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min16u, cv_hal_min16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min16s( const short* 
src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min16s, cv_hal_min16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min32s, cv_hal_min32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min32f, cv_hal_min32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_32f, float) - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min64f, cv_hal_min64f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_64f, double) - vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// AbsDiff -//======================================= - -void absdiff8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff8u, cv_hal_absdiff8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R) - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff8s, cv_hal_absdiff8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff16u, cv_hal_absdiff16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R) - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff16s, cv_hal_absdiff16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff32s, cv_hal_absdiff32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff32f, cv_hal_absdiff32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R) - (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff64f( const double* src1, size_t step1, - 
const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff64f, cv_hal_absdiff64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Logical -//======================================= - -#if (ARITHM_USE_IPP == 1) -#define CALL_IPP_UN(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); CV_UNUSED(src2); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } -#else -#define CALL_IPP_UN(fun) -#endif - -void and8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(and8u, cv_hal_and8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAnd_8u_C1R) - (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void or8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(or8u, cv_hal_or8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiOr_8u_C1R) - (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void xor8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(xor8u, cv_hal_xor8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiXor_8u_C1R) - (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void not8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height) - CALL_IPP_UN(ippiNot_8u_C1R) - (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, width, height)); -} - -//======================================= - -#if ARITHM_USE_IPP -inline static IppCmpOp convert_cmp(int _cmpop) -{ - return _cmpop == CMP_EQ ? ippCmpEq : - _cmpop == CMP_GT ? ippCmpGreater : - _cmpop == CMP_GE ? ippCmpGreaterEq : - _cmpop == CMP_LT ? ippCmpLess : - _cmpop == CMP_LE ? 
ippCmpLessEq : - (IppCmpOp)-1; -} -#define CALL_IPP_CMP(fun) \ - CV_IPP_CHECK() \ - { \ - IppCmpOp op = convert_cmp(*(int *)_cmpop); \ - if( op >= 0 ) \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } -#else -#define CALL_IPP_CMP(fun) -#endif - -//======================================= -// Compare -//======================================= - -void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp8u, cv_hal_cmp8u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_8u_C1R) - //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - - for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes ) - { - v_store(dst + x, (v_load(src1 + x) > v_load(src2 + x)) ^ mask); - } - } -#endif - - for( ; x < width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - - for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes ) - { - v_store(dst+x, (v_load(src1+x) == v_load(src2+x)) ^ mask); - } - } -#endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp8s, cv_hal_cmp8s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp16u, cv_hal_cmp16u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_16u_C1R) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp16s, cv_hal_cmp16s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_16s_C1R) - //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); - - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 
0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - const int dWidth = v_uint8x16::nlanes; - - for( ; x <= width - dWidth; x += dWidth ) - { - v_int16x8 in1 = v_load(src1 + x); - v_int16x8 in2 = v_load(src2 + x); - v_uint16x8 t1 = v_reinterpret_as_u16(in1 > in2); - - in1 = v_load(src1 + x + v_uint16x8::nlanes); - in2 = v_load(src2 + x + v_uint16x8::nlanes); - v_uint16x8 t2 = v_reinterpret_as_u16(in1 > in2); - - v_store(dst+x, (v_pack(t1, t2)) ^ mask); - } - } -#endif - for( ; x < width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - const int dWidth = v_uint8x16::nlanes; - - for( ; x <= width - dWidth; x += dWidth ) - { - v_int16x8 in1 = v_load(src1 + x); - v_int16x8 in2 = v_load(src2 + x); - v_uint16x8 t1 = v_reinterpret_as_u16(in1 == in2); - - in1 = v_load(src1 + x + 8); - in2 = v_load(src2 + x + 8); - v_uint16x8 t2 = v_reinterpret_as_u16(in1 == in2); - - v_store(dst+x, (v_pack(t1, t2)^ mask)); - } - } -#endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp32s, cv_hal_cmp32s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp32f, cv_hal_cmp32f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_32f_C1R) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp64f, cv_hal_cmp64f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -//======================================= - -#if defined HAVE_IPP -#define CALL_IPP_MUL(fun) \ - CV_IPP_CHECK() \ - { \ - if (std::fabs(fscale - 1) <= FLT_EPSILON) \ - { \ - if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } - -#define CALL_IPP_MUL_2(fun) \ - CV_IPP_CHECK() \ - { \ - if (std::fabs(fscale - 1) <= FLT_EPSILON) \ - { \ - if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)) >= 0) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } - -#else -#define CALL_IPP_MUL(fun) -#define CALL_IPP_MUL_2(fun) -#endif - -//======================================= -// Multilpy -//======================================= - -void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul8u, cv_hal_mul8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float 
fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_8u_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul8s, cv_hal_mul8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale); -} - -void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul16u, cv_hal_mul16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_16u_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul16s, cv_hal_mul16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_16s_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul32s, cv_hal_mul32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul32f, cv_hal_mul32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL_2(ippiMul_32f_C1R) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul64f, cv_hal_mul64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Divide -//======================================= - -void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div8u, cv_hal_div8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - if( src1 ) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); - else - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div8s, cv_hal_div8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div16u, cv_hal_div16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, 
step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div16s, cv_hal_div16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div32s, cv_hal_div32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div32f, cv_hal_div32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div64f, cv_hal_div64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Reciprocial -//======================================= - -void recip8u( const uchar*, size_t, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip8u, cv_hal_recip8u, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip8s( const schar*, size_t, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip8s, cv_hal_recip8s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip16u( const ushort*, size_t, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip16u, cv_hal_recip16u, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip16s( const short*, size_t, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip16s, cv_hal_recip16s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip32s( const int*, size_t, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip32s, cv_hal_recip32s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip32f( const float*, size_t, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip32f, cv_hal_recip32f, src2, step2, dst, step, width, height, *(const double*)scale) - recip_f(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip64f( const double*, size_t, const double* src2, size_t step2, - double* dst, size_t step, int width, int 
height, void* scale) -{ - CALL_HAL(recip64f, cv_hal_recip64f, src2, step2, dst, step, width, height, *(const double*)scale) - recip_f(src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Add weighted -//======================================= - -void -addWeighted8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, - void* scalars ) -{ - CALL_HAL(addWeighted8u, cv_hal_addWeighted8u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - const double* scalars_ = (const double*)scalars; - float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2]; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_float32x4 g = v_setall_f32(gamma); - v_float32x4 a = v_setall_f32(alpha); - v_float32x4 b = v_setall_f32(beta); - - for( ; x <= width - v_uint16x8::nlanes; x += v_uint16x8::nlanes ) - { - v_uint16x8 in1_16 = v_load_expand(src1 + x); - v_int32x4 in1_32_l, in1_32_h; - v_expand(v_reinterpret_as_s16(in1_16), in1_32_l, in1_32_h); - v_float32x4 in1_f_l = v_cvt_f32(in1_32_l); - v_float32x4 in1_f_h = v_cvt_f32(in1_32_h); - - v_uint16x8 in2_16 = v_load_expand(src2 + x); - v_int32x4 in2_32_l, in2_32_h; - v_expand(v_reinterpret_as_s16(in2_16), in2_32_l, in2_32_h); - v_float32x4 in2_f_l = v_cvt_f32(in2_32_l); - v_float32x4 in2_f_h = v_cvt_f32(in2_32_h); - - v_int32x4 out_l = v_round(in1_f_l * a + in2_f_l * b + g); - v_int32x4 out_h = v_round(in1_f_h * a + in2_f_h * b + g); - - v_int16x8 out_16 = v_pack(out_l, out_h); - v_pack_u_store(dst + x, out_16); - } - } -#endif - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - float t0, t1; - t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; - - dst[x] = saturate_cast(t0); - dst[x+1] = saturate_cast(t1); - - t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; - t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; - - dst[x+2] = saturate_cast(t0); - dst[x+3] = saturate_cast(t1); - } - #endif - - for( ; x < width; x++ ) - { - float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - dst[x] = saturate_cast(t0); - } - } -} - -void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted8s, cv_hal_addWeighted8s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted16u, cv_hal_addWeighted16u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted16s, cv_hal_addWeighted16s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t 
step2, - int* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted32s, cv_hal_addWeighted32s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted32f, cv_hal_addWeighted32f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted64f, cv_hal_addWeighted64f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -}} // cv::hal:: - -/* End of file. */ +/* End of file. */ \ No newline at end of file diff --git a/modules/core/src/arithm.dispatch.cpp b/modules/core/src/arithm.dispatch.cpp new file mode 100644 index 0000000000..1cbceaee29 --- /dev/null +++ b/modules/core/src/arithm.dispatch.cpp @@ -0,0 +1,11 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "precomp.hpp" +#include "arithm_ipp.hpp" +#include "arithm.simd.hpp" +#include "arithm.simd_declarations.hpp" + +#define ARITHM_DISPATCHING_ONLY +#include "arithm.simd.hpp" \ No newline at end of file diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp new file mode 100644 index 0000000000..0c1b33af18 --- /dev/null +++ b/modules/core/src/arithm.simd.hpp @@ -0,0 +1,1937 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "opencv2/core/hal/intrin.hpp" + +//========================================= +// Declare & Define & Dispatch in one step +//========================================= + +// ARITHM_DISPATCHING_ONLY defined by arithm dispatch file + +#undef ARITHM_DECLARATIONS_ONLY +#ifdef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + #define ARITHM_DECLARATIONS_ONLY +#endif + +#undef ARITHM_DEFINITIONS_ONLY +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && !defined(ARITHM_DISPATCHING_ONLY) + #define ARITHM_DEFINITIONS_ONLY +#endif + +#ifdef ARITHM_DECLARATIONS_ONLY + #undef DEFINE_SIMD + #define DEFINE_SIMD(fun_name, c_type, ...) \ + DECLARE_SIMD_FUN(fun_name, c_type) +#endif // ARITHM_DECLARATIONS_ONLY + +#ifdef ARITHM_DEFINITIONS_ONLY + #undef DEFINE_SIMD + #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \ + DECLARE_SIMD_FUN(fun_name, c_type) \ + DEFINE_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__) +#endif // ARITHM_DEFINITIONS_ONLY + +#ifdef ARITHM_DISPATCHING_ONLY + #undef DEFINE_SIMD + #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \ + DISPATCH_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__) +#endif // ARITHM_DISPATCHING_ONLY + +// workaround when neon miss support of double precision +#undef DEFINE_NOSIMD +#ifdef ARITHM_DEFINITIONS_ONLY + #define DEFINE_NOSIMD(fun_name, c_type, ...) 
\ + DECLARE_SIMD_FUN(fun_name, c_type) \ + DEFINE_NOSIMD_FUN(fun_name, c_type, __VA_ARGS__) +#else + #define DEFINE_NOSIMD DEFINE_SIMD +#endif // ARITHM_DEFINITIONS_ONLY + +#ifndef SIMD_GUARD + +#define DEFINE_SIMD_U8(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 8u), uchar, v_uint8, __VA_ARGS__) + +#define DEFINE_SIMD_S8(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 8s), schar, v_int8, __VA_ARGS__) + +#define DEFINE_SIMD_U16(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 16u), ushort, v_uint16, __VA_ARGS__) + +#define DEFINE_SIMD_S16(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 16s), short, v_int16, __VA_ARGS__) + +#define DEFINE_SIMD_S32(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 32s), int, v_int32, __VA_ARGS__) + +#define DEFINE_SIMD_F32(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__) + +#if CV_SIMD_64F + #define DEFINE_SIMD_F64(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__) +#else + #define DEFINE_SIMD_F64(fun, ...) \ + DEFINE_NOSIMD(__CV_CAT(fun, 64f), double, __VA_ARGS__) +#endif + +#define DEFINE_SIMD_SAT(fun, ...) \ + DEFINE_SIMD_U8(fun, __VA_ARGS__) \ + DEFINE_SIMD_S8(fun, __VA_ARGS__) \ + DEFINE_SIMD_U16(fun, __VA_ARGS__) \ + DEFINE_SIMD_S16(fun, __VA_ARGS__) + +#define DEFINE_SIMD_NSAT(fun, ...) \ + DEFINE_SIMD_S32(fun, __VA_ARGS__) \ + DEFINE_SIMD_F32(fun, __VA_ARGS__) \ + DEFINE_SIMD_F64(fun, __VA_ARGS__) + +#define DEFINE_SIMD_ALL(fun, ...) \ + DEFINE_SIMD_SAT(fun, __VA_ARGS__) \ + DEFINE_SIMD_NSAT(fun, __VA_ARGS__) + +#endif // SIMD_GUARD + +/////////////////////////////////////////////////////////////////////////// + +namespace cv { namespace hal { + +#ifndef ARITHM_DISPATCHING_ONLY + CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +#endif + +#ifdef ARITHM_DEFINITIONS_ONLY + +#if !CV_SIMD_64F +typedef int v_float64; // dummy +#endif + +//======================================= +// Utility +//======================================= + +/** add **/ +template +static inline T c_add(T a, T b) +{ return saturate_cast(a + b); } +template<> +inline uchar c_add(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a + b); } +// scale +template +static inline T1 c_add(T1 a, T1 b, T2 scalar) +{ return saturate_cast((T2)a * scalar + b); } +template<> +inline uchar c_add(uchar a, uchar b, float scalar) +{ return saturate_cast(CV_8TO32F(a) * scalar + b); } +// weight +template +static inline T1 c_add(T1 a, T1 b, T2 alpha, T2 beta, T2 gamma) +{ return saturate_cast(a * alpha + b * beta + gamma); } +template<> +inline uchar c_add(uchar a, uchar b, float alpha, float beta, float gamma) +{ return saturate_cast(CV_8TO32F(a) * alpha + CV_8TO32F(b) * beta + gamma); } + +/** sub **/ +template +static inline T c_sub(T a, T b) +{ return saturate_cast(a - b); } +template<> +inline uchar c_sub(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a - b); } + +/** max **/ +template +static inline T c_max(T a, T b) +{ return std::max(a, b); } +template<> +inline uchar c_max(uchar a, uchar b) +{ return CV_MAX_8U(a, b); } + +/** min **/ +template +static inline T c_min(T a, T b) +{ return std::min(a, b); } +template<> +inline uchar c_min(uchar a, uchar b) +{ return CV_MIN_8U(a, b); } + +/** absdiff **/ +template +static inline T c_absdiff(T a, T b) +{ return a > b ? 
a - b : b - a; } +template<> +inline schar c_absdiff(schar a, schar b) +{ return saturate_cast(std::abs(a - b)); } +template<> +inline short c_absdiff(short a, short b) +{ return saturate_cast(std::abs(a - b)); } +// specializations to prevent "-0" results +template<> +inline float c_absdiff(float a, float b) +{ return std::abs(a - b); } +template<> +inline double c_absdiff(double a, double b) +{ return std::abs(a - b); } + +/** multiply **/ +template +static inline T c_mul(T a, T b) +{ return saturate_cast(a * b); } +template<> +inline uchar c_mul(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a * b); } +// scale +template +static inline T1 c_mul(T1 a, T1 b, T2 scalar) +{ return saturate_cast(scalar * (T2)a * b); } +template<> +inline uchar c_mul(uchar a, uchar b, float scalar) +{ return saturate_cast(scalar * CV_8TO32F(a) * CV_8TO32F(b)); } + +/** divide & reciprocal **/ +template +static inline T2 c_div(T1 a, T2 b) +{ return saturate_cast(a / b); } +// recip +template<> +inline uchar c_div(float a, uchar b) +{ return saturate_cast(a / CV_8TO32F(b)); } +// scale +template +static inline T1 c_div(T1 a, T1 b, T2 scalar) +{ return saturate_cast(scalar * (T2)a / b); } +template<> +inline uchar c_div(uchar a, uchar b, float scalar) +{ return saturate_cast(scalar * CV_8TO32F(a) / CV_8TO32F(b)); } + +//======================================= +// Arithmetic and logical operations +// +, -, *, /, &, |, ^, ~, abs ... +//======================================= + +///////////////////////////// Operations ////////////////////////////////// + +// Add +template +struct op_add +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a + b; } + static inline T1 r(T1 a, T1 b) + { return c_add(a, b); } +}; + +// Subtract +template +struct op_sub +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a - b; } + static inline T1 r(T1 a, T1 b) + { return c_sub(a, b); } +}; + +// Max & Min +template +struct op_max +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_max(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_max(a, b); } +}; + +template +struct op_min +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_min(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_min(a, b); } +}; + +// Absolute difference +template +struct op_absdiff +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_absdiff(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_absdiff(a, b); } +}; +// Signed absolute difference, 's' +template<> +struct op_absdiff +{ + static inline v_int8 r(const v_int8& a, const v_int8& b) + { return v_absdiffs(a, b); } + static inline schar r(schar a, schar b) + { return c_absdiff(a, b); } +}; +template<> +struct op_absdiff +{ + static inline v_int16 r(const v_int16& a, const v_int16& b) + { return v_absdiffs(a, b); } + static inline short r(short a, short b) + { return c_absdiff(a, b); } +}; +template<> +struct op_absdiff +{ + static inline v_int32 r(const v_int32& a, const v_int32& b) + { return v_reinterpret_as_s32(v_absdiff(a, b)); } + static inline int r(int a, int b) + { return c_absdiff(a, b); } +}; + +// Logical +template +struct op_or +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a | b; } + static inline T1 r(T1 a, T1 b) + { return a | b; } +}; +template +struct op_xor +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a ^ b; } + static inline T1 r(T1 a, T1 b) + { return a ^ b; } +}; +template +struct op_and +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a & 
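// Why c_absdiff() above is specialized for schar and short: the generic
// "a > b ? a - b : b - a" wraps around at the signed extremes, while the
// saturating form (and v_absdiffs on the vector side) clamps. Standalone check
// with a hypothetical helper name (needs <cstdlib> for std::abs):
static void absdiff_saturation_check()
{
    schar a = 127, b = -128;
    int wide = std::abs((int)a - (int)b);        // 255: does not fit in schar
    schar sat = cv::saturate_cast<schar>(wide);  // 127, matching c_absdiff(a, b)
    CV_Assert(sat == 127);
}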
b; } + static inline T1 r(T1 a, T1 b) + { return a & b; } +}; +template +struct op_not +{ + // ignored b from loader level + static inline Tvec r(const Tvec& a) + { return ~a; } + static inline T1 r(T1 a, T1) + { return ~a; } +}; + +//////////////////////////// Loaders ///////////////////////////////// + +#if CV_SIMD + +template< template class OP, typename T1, typename Tvec> +struct bin_loader +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load(src1); + Tvec b = vx_load(src2); + v_store(dst, op::r(a, b)); + } + + static inline void la(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load_aligned(src1); + Tvec b = vx_load_aligned(src2); + v_store_aligned(dst, op::r(a, b)); // todo: try write without cache + } + + static inline void l64(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load_low(src1), b = vx_load_low(src2); + v_store_low(dst, op::r(a, b)); + } +}; + +// void src2 for operation "not" +template +struct bin_loader +{ + typedef op_not op; + + static inline void l(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load(src1); + v_store(dst, op::r(a)); + } + + static inline void la(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load_aligned(src1); + v_store_aligned(dst, op::r(a)); + } + + static inline void l64(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load_low(src1); + v_store_low(dst, op::r(a)); + } +}; + +#endif // CV_SIMD + +//////////////////////////// Loops ///////////////////////////////// + +template +static inline bool is_aligned(const T1* src1, const T1* src2, const T2* dst) +{ return (((size_t)src1|(size_t)src2|(size_t)dst) & (CV_SIMD_WIDTH - 1)) == 0; } + +template class OP, typename T1, typename Tvec> +static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) +{ + typedef OP op; +#if CV_SIMD + typedef bin_loader ldr; + enum {wide_step = Tvec::nlanes}; + #if !CV_NEON && CV_SIMD_WIDTH == 16 + enum {wide_step_l = wide_step * 2}; + #else + enum {wide_step_l = wide_step}; + #endif +#endif // CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + #if !CV_NEON + if (is_aligned(src1, src2, dst)) + { + for (; x <= width - wide_step_l; x += wide_step_l) + { + ldr::la(src1 + x, src2 + x, dst + x); + #if !CV_NEON && CV_SIMD_WIDTH == 16 + ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); + #endif + } + } + else + #endif + for (; x <= width - wide_step_l; x += wide_step_l) + { + ldr::l(src1 + x, src2 + x, dst + x); + #if !CV_NEON && CV_SIMD_WIDTH == 16 + ldr::l(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); + #endif + } + + #if CV_SIMD_WIDTH == 16 + for (; x <= width - 8/(int)sizeof(T1); x += 8/(int)sizeof(T1)) + { + ldr::l64(src1 + x, src2 + x, dst + x); + } + #endif + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x]); + T1 t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } + + vx_cleanup(); +} + +#if !CV_SIMD_64F +template class OP, typename T1, typename Tvec> +static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* 
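// Condensed illustration (not part of the patch) of the loop shape bin_loop
// uses above: full-width vectors, the 64-bit low-half path on 128-bit builds,
// then a scalar tail. The aligned fast path and the extra 2x unroll are left
// out; min_row_8u is a hypothetical helper (same intrinsics header as the
// earlier sketch, plus <algorithm>):
static void min_row_8u(const uchar* a, const uchar* b, uchar* dst, int width)
{
    int x = 0;
#if CV_SIMD
    for (; x <= width - cv::v_uint8::nlanes; x += cv::v_uint8::nlanes)
        cv::v_store(dst + x, cv::v_min(cv::vx_load(a + x), cv::vx_load(b + x)));
#if CV_SIMD_WIDTH == 16
    for (; x <= width - 8; x += 8)   // same role as bin_loader::l64
        cv::v_store_low(dst + x, cv::v_min(cv::vx_load_low(a + x), cv::vx_load_low(b + x)));
#endif
    cv::vx_cleanup();
#endif
    for (; x < width; x++)
        dst[x] = std::min(a[x], b[x]);
}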
dst, size_t step, int width, int height) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x]); + T1 t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } +} +#define BIN_LOOP64F bin_loop_nosimd +#else +#define BIN_LOOP64F bin_loop +#endif //!CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + +//////////////////////////////////////////////////////////////////////////////////// + +#ifndef SIMD_GUARD +#define BIN_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ + _T1* dst, size_t step, int width, int height + +#define BIN_ARGS_PASS src1, step1, src2, step2, dst, step, width, height +#endif // SIMD_GUARD + +#undef DECLARE_SIMD_FUN +#define DECLARE_SIMD_FUN(fun, _T1) void fun(BIN_ARGS(_T1)); + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, _OP) \ + void fun(BIN_ARGS(_T1), void*) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), BIN_ARGS_PASS) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), BIN_ARGS_PASS) \ + CV_CPU_DISPATCH(fun, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL); \ + } + +#undef DEFINE_SIMD_FUN +#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, _OP) \ + void fun(BIN_ARGS(_T1)) \ + { \ + CV_INSTRUMENT_REGION(); \ + bin_loop<_OP, _T1, _Tvec>(BIN_ARGS_PASS); \ + } + +#undef DEFINE_NOSIMD_FUN +#define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \ + void fun(BIN_ARGS(_T1)) \ + { \ + CV_INSTRUMENT_REGION(); \ + bin_loop_nosimd<_OP, _T1, v_float64>(BIN_ARGS_PASS); \ + } + +DEFINE_SIMD_ALL(add, op_add) +DEFINE_SIMD_ALL(sub, op_sub) + +DEFINE_SIMD_ALL(min, op_min) +DEFINE_SIMD_ALL(max, op_max) + +DEFINE_SIMD_ALL(absdiff, op_absdiff) + +DEFINE_SIMD_U8(or, op_or) +DEFINE_SIMD_U8(xor, op_xor) +DEFINE_SIMD_U8(and, op_and) + +// One source!, an exception for operation "not" +// we could use macros here but it's better to implement it +// with that way to give more clarification +// about how macroS "DEFINE_SIMD_*" are works + +#if defined(ARITHM_DECLARATIONS_ONLY) || defined(ARITHM_DEFINITIONS_ONLY) +void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +#endif +#ifdef ARITHM_DEFINITIONS_ONLY +void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + CV_INSTRUMENT_REGION(); + bin_loop(src1, step1, src2, step2, dst, step, width, height); +} +#endif +#ifdef ARITHM_DISPATCHING_ONLY +void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void*) +{ + CV_INSTRUMENT_REGION(); + CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height) + ARITHM_CALL_IPP(arithm_ipp_not8u, src1, step1, dst, step, width, height) + CV_CPU_DISPATCH(not8u, (src1, step1, src2, step2, dst, step, width, height), CV_CPU_DISPATCH_MODES_ALL); +} +#endif + +//======================================= +// Compare +//======================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +template +struct op_cmplt +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a < b; } + 
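// What DEFINE_SIMD_ALL(add, op_add) above produces for the 8u case, written
// out by hand from the macros (abbreviated; instrumentation omitted):
//
//  declaration pass:
//    void add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int);
//
//  definition pass (compiled once per enabled ISA):
//    void add8u(BIN_ARGS(uchar)) { bin_loop<op_add, uchar, v_uint8>(BIN_ARGS_PASS); }
//
//  dispatching pass (arithm.dispatch.cpp only):
//    void add8u(BIN_ARGS(uchar), void*)
//    {
//        CALL_HAL(add8u, cv_hal_add8u, BIN_ARGS_PASS)
//        ARITHM_CALL_IPP(arithm_ipp_add8u, BIN_ARGS_PASS)
//        CV_CPU_DISPATCH(add8u, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL);
//    }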
static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a < b); } +}; + +template +struct op_cmple +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a <= b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a <= b); } +}; + +template +struct op_cmpeq +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a == b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a == b); } +}; + +template +struct op_cmpne +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a != b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a != b); } +}; + +//////////////////////////// Loaders ///////////////////////////////// + +#if CV_SIMD +// todo: add support for RW alignment & stream +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + void l(const T1* src1, const T1* src2, uchar* dst); +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + Tvec a = vx_load(src1); + Tvec b = vx_load(src2); + v_store(dst, v_reinterpret_as_u8(op::r(a, b))); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + Tvec c0 = op::r(vx_load(src1), vx_load(src2)); + Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step)); + v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1))); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2))); + v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step))); + v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); + v_uint32 c3 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); + v_store(dst, v_pack_b(c0, c1, c2, c3)); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2))); + v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step))); + v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); + v_uint64 c3 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); + + v_uint64 c4 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 4), vx_load(src2 + step * 4))); + v_uint64 c5 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 5), vx_load(src2 + step * 5))); + v_uint64 c6 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 6), vx_load(src2 + step * 6))); + v_uint64 c7 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 7), vx_load(src2 + step * 7))); + v_store(dst, v_pack_b(c0, c1, c2, c3, c4, c5, c6, c7)); + } +}; + +#endif // CV_SIMD + +//////////////////////////// Loops ///////////////////////////////// + +template class OP, typename T1, typename Tvec> +static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + typedef OP op; +#if CV_SIMD + typedef cmp_loader_n ldr; + enum {wide_step = Tvec::nlanes * sizeof(T1)}; +#endif // 
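// The wider comparison loaders above can use v_pack_b because comparison
// results are all-ones/all-zeros lanes, so truncating them to bytes loses
// nothing. Minimal standalone sketch for the 16-bit case (hypothetical helper
// name, assumes CV_SIMD and the same intrinsics header); it consumes two
// source vectors and emits one full uchar vector of 0x00/0xFF:
static void cmpeq_block_16u(const ushort* a, const ushort* b, uchar* dst)
{
    cv::v_uint16 c0 = cv::vx_load(a) == cv::vx_load(b);
    cv::v_uint16 c1 = cv::vx_load(a + cv::v_uint16::nlanes) == cv::vx_load(b + cv::v_uint16::nlanes);
    cv::v_store(dst, cv::v_pack_b(c0, c1));   // 0xFFFF/0x0000 lanes -> 0xFF/0x00 bytes
}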
CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, src2 + x, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + uchar t0 = op::r(src1[x], src2[x]); + uchar t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } + + vx_cleanup(); +} + +template +static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + switch(cmpop) + { + case CMP_LT: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GT: + cmp_loop(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_LE: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GE: + cmp_loop(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_EQ: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + default: + CV_Assert(cmpop == CMP_NE); + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + } +} + +#if !CV_SIMD_64F +template< template class OP, typename T1> +static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + uchar t0 = op::r(src1[x], src2[x]); + uchar t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } +} +static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + switch(cmpop) + { + case CMP_LT: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GT: + cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_LE: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GE: + cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_EQ: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + default: + CV_Assert(cmpop == CMP_NE); + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + } +} +#endif // !CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + +///////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef SIMD_GUARD +#define CMP_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ + uchar* dst, size_t step, int width, int height + +#define CMP_ARGS_PASS src1, step1, src2, step2, dst, step, width, height +#endif // SIMD_GUARD + +#undef DECLARE_SIMD_FUN +#define DECLARE_SIMD_FUN(fun, _T1) void fun(CMP_ARGS(_T1), int cmpop); + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) 
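// Two conventions used by the comparison code above: the scalar comparators
// turn a bool into the 0/255 mask by negation, and '>' / '>=' are never
// implemented directly -- the dispatcher swaps the sources instead
// (a > b <=> b < a, a >= b <=> b <= a). Standalone check, hypothetical name:
static void cmp_mask_check()
{
    uchar t = (uchar)-(int)(5 > 3);   // true  -> -1 -> 0xFF == 255
    uchar f = (uchar)-(int)(3 > 5);   // false ->  0 -> 0x00
    CV_Assert(t == 255 && f == 0);
}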
\ + void fun(CMP_ARGS(_T1), void* _cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ + CV_CPU_DISPATCH(fun, (CMP_ARGS_PASS, *(int*)_cmpop), CV_CPU_DISPATCH_MODES_ALL); \ + } + +#undef DEFINE_SIMD_FUN +#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(CMP_ARGS(_T1), int cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + cmp_loop<_T1, _Tvec>(CMP_ARGS_PASS, cmpop); \ + } + +#undef DEFINE_NOSIMD_FUN +#define DEFINE_NOSIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(CMP_ARGS(_T1), int cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + cmp_loop_nosimd(CMP_ARGS_PASS, cmpop); \ + } + +// todo: try to avoid define dispatcher functions using macros with these such cases +DEFINE_SIMD_ALL(cmp) + +//========================================================================= +// scaling helpers for single and dual source +// +// Dual: Multiply, Div, AddWeighted +// +// Single: Reciprocal +// +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +//////////////////////////// Loaders /////////////////////////////// + +#if CV_SIMD +// todo: add support for RW alignment & stream +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst); + // single source + void l(const T1* src1, const T2* scalar, T1* dst); +}; + +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) + { + v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); + v_int16 v_src2 = v_reinterpret_as_s16(vx_load_expand(src2)); + + v_int32 t0, t1, t2, t3; + v_expand(v_src1, t0, t2); + v_expand(v_src2, t1, t3); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(t0); + f1 = v_cvt_f32(t1); + f2 = v_cvt_f32(t2); + f3 = v_cvt_f32(t3); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + store(dst, v_src2, r0, r1); + } + + static inline void l(const T1* src1, const T2* scalar, T1* dst) + { + v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); + + v_int32 t0, t1; + v_expand(v_src1, t0, t1); + + v_float32 f0, f1; + f0 = v_cvt_f32(t0); + f1 = v_cvt_f32(t1); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + store(dst, v_src1, r0, r1); + } + + static inline void store(uchar* dst, const v_int16& src, const v_int32& a, const v_int32& b) + { + v_pack_u_store(dst, op::pre(src, v_pack(a, b))); + } + static inline void store(schar* dst, const v_int16& src, const v_int32& a, const v_int32& b) + { + v_pack_store(dst, op::pre(src, v_pack(a, b))); + } +}; + +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + typedef typename V_RegTraits::w_reg Twvec; + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) + { + Tvec v_src1 = vx_load(src1); + Tvec v_src2 = vx_load(src2); + + Twvec t0, t1, t2, t3; + v_expand(v_src1, t0, t2); + v_expand(v_src2, t1, t3); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 
r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + store(dst, v_src2, r0, r1); + } + + static inline void l(const T1* src1, const T2* scalar, T1* dst) + { + Tvec v_src1 = vx_load(src1); + + Twvec t0, t1; + v_expand(v_src1, t0, t1); + + v_float32 f0, f1; + f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + store(dst, v_src1, r0, r1); + } + + static inline void store(ushort* dst, const Tvec& src, const v_int32& a, const v_int32& b) + { + v_store(dst, op::pre(src, v_pack_u(a, b))); + } + static inline void store(short* dst, const Tvec& src, const v_int32& a, const v_int32& b) + { + v_store(dst, op::pre(src, v_pack(a, b))); + } +}; + +template class OP, typename T2> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_int32::nlanes}; + + static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src2 = vx_load(src2); + v_int32 v_src1s = vx_load(src1 + step); + v_int32 v_src2s = vx_load(src2 + step); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); + f1 = v_cvt_f32(v_reinterpret_as_s32(v_src2)); + f2 = v_cvt_f32(v_reinterpret_as_s32(v_src1s)); + f3 = v_cvt_f32(v_reinterpret_as_s32(v_src2s)); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline void l(const int* src1, const T2* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src1s = vx_load(src1 + step); + + v_float32 f0, f1; + f0 = v_cvt_f32(v_src1); + f1 = v_cvt_f32(v_src1s); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; + +template class OP, typename T2> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_float32::nlanes}; + + static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src2 = vx_load(src2); + v_float32 v_src1s = vx_load(src1 + step); + v_float32 v_src2s = vx_load(src2 + step); + + v_float32 r0 = op::r(v_src1, v_src2, scalar); + v_float32 r1 = op::r(v_src1s, v_src2s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline void l(const float* src1, const T2* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src1s = vx_load(src1 + step); + + v_float32 r0 = op::r(v_src1, scalar); + v_float32 r1 = op::r(v_src1s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; +#endif // CV_SIMD + +#if CV_SIMD_64F +template class OP> +struct scalar_loader_n +{ + typedef OP op; + typedef OP op64; + enum {step = v_int32::nlanes}; + + static inline void l(const int* src1, const int* src2, const double* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src2 = vx_load(src2); + v_int32 v_src1s = vx_load(src1 + step); + v_int32 v_src2s = vx_load(src2 + step); + + v_int32 r0 = r(v_src1, v_src2, scalar); + v_int32 r1 = r(v_src1s, v_src2s, scalar); + 
+ r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const int* src1, const double* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src1s = vx_load(src1 + step); + + v_int32 r0 = r(v_src1, scalar); + v_int32 r1 = r(v_src1s, scalar); + + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline v_int32 r(const v_int32& a, const v_int32& b, const double* scalar) + { + v_float64 f0, f1, f2, f3; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + f2 = v_cvt_f64(b); + f3 = v_cvt_f64_high(b); + + v_float64 r0 = op64::r(f0, f2, scalar); + v_float64 r1 = op64::r(f1, f3, scalar); + + return v_round(r0, r1); + } + static inline v_int32 r(const v_int32& a, const double* scalar) + { + v_float64 f0, f1; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + + v_float64 r0 = op64::r(f0, scalar); + v_float64 r1 = op64::r(f1, scalar); + + return v_round(r0, r1); + } +}; + +template class OP> +struct scalar_loader_n +{ + typedef OP op; + typedef OP op64; + enum {step = v_float32::nlanes}; + + static inline void l(const float* src1, const float* src2, const double* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src2 = vx_load(src2); + v_float32 v_src1s = vx_load(src1 + step); + v_float32 v_src2s = vx_load(src2 + step); + + v_float32 r0 = r(v_src1, v_src2, scalar); + v_float32 r1 = r(v_src1s, v_src2s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const float* src1, const double* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src1s = vx_load(src1 + step); + + v_float32 r0 = r(v_src1, scalar); + v_float32 r1 = r(v_src1s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline v_float32 r(const v_float32& a, const v_float32& b, const double* scalar) + { + v_float64 f0, f1, f2, f3; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + f2 = v_cvt_f64(b); + f3 = v_cvt_f64_high(b); + + v_float64 r0 = op64::r(f0, f2, scalar); + v_float64 r1 = op64::r(f1, f3, scalar); + + return v_cvt_f32(r0, r1); + } + static inline v_float32 r(const v_float32& a, const double* scalar) + { + v_float64 f0, f1; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + + v_float64 r0 = op64::r(f0, scalar); + v_float64 r1 = op64::r(f1, scalar); + + return v_cvt_f32(r0, r1); + } +}; + +template class OP> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_float64::nlanes}; + + static inline void l(const double* src1, const double* src2, const double* scalar, double* dst) + { + v_float64 v_src1 = vx_load(src1); + v_float64 v_src2 = vx_load(src2); + v_float64 v_src1s = vx_load(src1 + step); + v_float64 v_src2s = vx_load(src2 + step); + + v_float64 r0 = op::r(v_src1, v_src2, scalar); + v_float64 r1 = op::r(v_src1s, v_src2s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const double* src1, const double* scalar, double* dst) + { + v_float64 v_src1 = vx_load(src1); + v_float64 v_src1s = vx_load(src1 + step); + + v_float64 r0 = op::r(v_src1, scalar); + v_float64 r1 = op::r(v_src1s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src1, r0); + r1 = 
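// The CV_SIMD_64F specializations above keep the scale in full double
// precision: a v_int32 is split into two v_float64 halves with
// v_cvt_f64 / v_cvt_f64_high, scaled, and re-fused by the two-argument
// v_round() overload this patch adds. Condensed sketch (hypothetical helper,
// assumes CV_SIMD_64F and the same intrinsics header):
static cv::v_int32 scale_s32_f64(const cv::v_int32& a, double scale)
{
    cv::v_float64 s  = cv::vx_setall_f64(scale);
    cv::v_float64 lo = cv::v_cvt_f64(a) * s;       // lower lanes as double
    cv::v_float64 hi = cv::v_cvt_f64_high(a) * s;  // upper lanes as double
    return cv::v_round(lo, hi);                    // round both halves into one v_int32
}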
op::pre(v_src1s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; +#endif // CV_SIMD_64F + +//////////////////////////// Loops ///////////////////////////////// + +// dual source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; +#if CV_SIMD + typedef scalar_loader_n ldr; + const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : + sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; +#endif // CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, src2 + x, scalar, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x], scalar); + T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2], scalar); + t1 = op::r(src1[x + 3], src2[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; ++x) + dst[x] = op::r(src1[x], src2[x], scalar); + } + + vx_cleanup(); +} + +// single source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; +#if CV_SIMD + typedef scalar_loader_n ldr; + const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : + sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; +#endif // CV_SIMD + + step1 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, scalar, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], scalar); + T1 t1 = op::r(src1[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], scalar); + t1 = op::r(src1[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; ++x) + dst[x] = op::r(src1[x], scalar); + } + + vx_cleanup(); +} + +#if !CV_SIMD_64F +// dual source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x], scalar); + T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2], scalar); + t1 = op::r(src1[x + 3], src2[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; ++x) + dst[x] = op::r(src1[x], src2[x], scalar); + } +} + +// single source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; + + step1 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, 
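// Note on the wide_step expression in scalar_loop above: it counts elements
// consumed per loader call, not lanes. With 128-bit registers:
//   8-bit  types: Tvec::nlanes / 2 = 8  (vx_load_expand reads half a register)
//   16-bit types: Tvec::nlanes     = 8  (one full register)
//   32/64-bit   : Tvec::nlanes * 2      (the loaders process two registers per call)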
dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], scalar); + T1 t1 = op::r(src1[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], scalar); + t1 = op::r(src1[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; ++x) + dst[x] = op::r(src1[x], scalar); + } +} + +#define SCALAR_LOOP64F scalar_loop_nosimd +#else +#define SCALAR_LOOP64F scalar_loop +#endif // !CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + +//========================================================================= +// Multiply +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +template +struct op_mul +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a * b; } + static inline T1 r(T1 a, T1 b) + { return saturate_cast(a * b); } +}; + +template +struct op_mul_scale +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) + { + const v_float32 v_scalar = vx_setall_f32(*scalar); + return v_scalar * a * b; + } + static inline T1 r(T1 a, T1 b, const T2* scalar) + { return c_mul(a, b, *scalar); } + static inline Tvec pre(const Tvec&, const Tvec& res) + { return res; } +}; + +template<> +struct op_mul_scale +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) + { + const v_float64 v_scalar = vx_setall_f64(*scalar); + return v_scalar * a * b; + } +#endif + static inline double r(double a, double b, const double* scalar) + { return c_mul(a, b, *scalar); } + static inline v_float64 pre(const v_float64&, const v_float64& res) + { return res; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void mul_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON) + { + bin_loop(src1, step1, src2, step2, dst, step, width, height); + } + else + { + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, &fscalar); + } +} + +template +static void mul_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalar) +{ + if (std::fabs(*scalar - 1.0) <= FLT_EPSILON) + { + bin_loop(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalar); + } +} + +template<> +void mul_loop_d(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, const double* scalar) +{ + if (*scalar == 1.0) + { + BIN_LOOP64F(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalar); + } +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +#undef SCALAR_ARGS +#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ + _T1* dst, size_t step, int width, int height + +#undef SCALAR_ARGS_PASS +#define SCALAR_ARGS_PASS src1, step1, src2, step2, dst, step, width, height + +#undef DECLARE_SIMD_FUN +#define DECLARE_SIMD_FUN(fun, _T1) void fun(SCALAR_ARGS(_T1), const double* scalar); + +#undef DISPATCH_SIMD_FUN +#define 
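// op_mul_scale above reduces to saturate_cast<T1>(scale * a * b) per element;
// for the 8/16-bit types the double scale is narrowed to float once per call,
// and mul_loop short-circuits scale == 1.0 to the plain bin_loop. Worked check
// (hypothetical helper name):
static void mul_scale_check()
{
    uchar r1 = cv::saturate_cast<uchar>(0.5f * 200 * 3);  // 300 -> clamps to 255
    uchar r2 = cv::saturate_cast<uchar>(0.1f * 200 * 3);  // ~60 -> rounds to 60
    CV_Assert(r1 == 255 && r2 == 60);
}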
DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(SCALAR_ARGS(_T1), void* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ + CV_CPU_DISPATCH_MODES_ALL); \ + } + +#undef DEFINE_SIMD_FUN +#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, op) \ + void fun(SCALAR_ARGS(_T1), const double* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + op<_T1, _Tvec>(SCALAR_ARGS_PASS, scalar); \ + } + +#undef DEFINE_NOSIMD_FUN +#define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \ + DEFINE_SIMD_FUN(fun, _T1, v_float64, _OP) + +DEFINE_SIMD_SAT(mul, mul_loop) +DEFINE_SIMD_F32(mul, mul_loop_d) +DEFINE_SIMD_S32(mul, mul_loop_d) +DEFINE_SIMD_F64(mul, mul_loop_d) + +//========================================================================= +// Div +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +#if CV_VERSION_MAJOR == 3 +template +struct op_div_f +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { + const Tvec v_zero = Tvec(); + return v_select(b == v_zero, v_zero, a / b); + } + static inline T1 r(T1 a, T1 b) + { return b != (T1)0 ? a / b : (T1)0; } +}; +#else +template +struct op_div_f +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a / b; } + static inline T1 r(T1 a, T1 b) + { return a / b; } +}; +#endif + +template +struct op_div_scale +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) + { + const v_float32 v_scalar = vx_setall_f32(*scalar); + return a * v_scalar / b; + } + static inline Tvec pre(const Tvec& denom, const Tvec& res) + { + const Tvec v_zero = Tvec(); + return v_select(denom == v_zero, v_zero, res); + } + static inline T1 r(T1 a, T1 denom, const T2* scalar) + { return denom != (T1)0 ? c_div(a, denom, *scalar) : (T1)0; } +}; + +template<> +struct op_div_scale +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) + { + const v_float64 v_scalar = vx_setall_f64(*scalar); + return a * v_scalar / b; + } + static inline v_float64 pre(const v_float64& denom, const v_float64& res) + { + const v_float64 v_zero = vx_setzero_f64(); + return v_select(denom == v_zero, v_zero, res); + } +#endif + static inline double r(double a, double denom, const double* scalar) + { return denom != 0.0 ? 
c_div(a, denom, *scalar) : 0.0; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void div_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + // todo: add new intrinsics for integer divide + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, &fscalar); +} + +template<> +void div_loop(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON) + { + bin_loop(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, &fscalar); + } +} + +template<> +void div_loop(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, const double* scalar) +{ + if (*scalar == 1.0) + { + BIN_LOOP64F(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalar); + } +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +DEFINE_SIMD_ALL(div, div_loop) + +//========================================================================= +// AddWeighted +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +///// Add scale +template +struct op_add_scale +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) + { + const v_float32 v_alpha = vx_setall_f32(*scalar); + return v_fma(a, v_alpha, b); + } + static inline T1 r(T1 a, T1 b, const T2* scalar) + { return c_add(a, b, *scalar); } + static inline Tvec pre(const Tvec&, const Tvec& res) + { return res; } +}; + +template<> +struct op_add_scale +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) + { + const v_float64 v_alpha = vx_setall_f64(*scalar); + return v_fma(a, v_alpha, b); + } +#endif + static inline double r(double a, double b, const double* scalar) + { return c_add(a, b, *scalar); } + static inline v_float64 pre(const v_float64&, const v_float64& res) + { return res; } +}; + +///// Weighted sum +template +struct op_add_weighted +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars) + { + const v_float32 v_alpha = vx_setall_f32(scalars[0]); + const v_float32 v_beta = vx_setall_f32(scalars[1]); + const v_float32 v_gamma = vx_setall_f32(scalars[2]); + return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); + } + static inline T1 r(T1 a, T1 b, const T2* scalars) + { return c_add(a, b, scalars[0], scalars[1], scalars[2]); } + static inline Tvec pre(const Tvec&, const Tvec& res) + { return res; } +}; + +template<> +struct op_add_weighted +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars) + { + const v_float64 v_alpha = vx_setall_f64(scalars[0]); + const v_float64 v_beta = vx_setall_f64(scalars[1]); + const v_float64 v_gamma = vx_setall_f64(scalars[2]); + return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); + } +#endif + static inline double r(double a, double b, const double* scalars) + { return c_add(a, b, scalars[0], 
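// The divide-by-zero convention encoded in op_div_scale above: a zero
// denominator yields 0, never inf/NaN. The vector path divides unconditionally
// and then masks the result with v_select(denom == 0, 0, res); the scalar path
// tests first. Scalar equivalent (hypothetical helper name):
static double div_scaled(double num, double den, double scale)
{
    return den != 0.0 ? scale * num / den : 0.0;   // e.g. div_scaled(7, 0, 2) == 0
}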
scalars[1], scalars[2]); } + static inline v_float64 pre(const v_float64&, const v_float64& res) + { return res; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void add_weighted_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalars) +{ + float fscalars[] = {(float)scalars[0], (float)scalars[1], (float)scalars[2]}; + if (fscalars[1] == 1.0f && fscalars[2] == 0.0f) + { + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, fscalars); + } + else + { + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, fscalars); + } +} + +template +static void add_weighted_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalars) +{ + if (scalars[1] == 1.0 && scalars[2] == 0.0) + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } +} + +template<> +void add_weighted_loop_d(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, const double* scalars) +{ + if (scalars[1] == 1.0 && scalars[2] == 0.0) + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(SCALAR_ARGS(_T1), void* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ + SCALAR_ARGS_PASS, (const double*)scalar) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ + SCALAR_ARGS_PASS, (const double*)scalar) \ + CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ + CV_CPU_DISPATCH_MODES_ALL); \ + } + +DEFINE_SIMD_SAT(addWeighted, add_weighted_loop) +DEFINE_SIMD_S32(addWeighted, add_weighted_loop_d) +DEFINE_SIMD_F32(addWeighted, add_weighted_loop_d) +DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d) + +//======================================= +// Reciprocal +//======================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +template +struct op_recip +{ + static inline v_float32 r(const v_float32& a, const T2* scalar) + { + const v_float32 v_scalar = vx_setall_f32(*scalar); + return v_scalar / a; + } + static inline Tvec pre(const Tvec& denom, const Tvec& res) + { + const Tvec v_zero = Tvec(); + return v_select(denom == v_zero, v_zero, res); + } + static inline T1 r(T1 denom, const T2* scalar) + { return denom != (T1)0 ? c_div(*scalar, denom) : (T1)0; } +}; + +template<> +struct op_recip +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const double* scalar) + { + const v_float64 v_scalar = vx_setall_f64(*scalar); + return v_scalar / a; + } + static inline v_float64 pre(const v_float64& denom, const v_float64& res) + { + const v_float64 v_zero = vx_setzero_f64(); + return v_select(denom == v_zero, v_zero, res); + } +#endif + static inline double r(double denom, const double* scalar) + { return denom != 0.0 ? 
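// op_add_weighted above computes alpha*a + beta*b + gamma as two chained fused
// multiply-adds, v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); the dispatcher
// above therefore forwards the whole scalars array (alpha, beta, gamma) rather
// than a single double. Scalar check with std::fma from <cmath>, using exactly
// representable values (hypothetical helper name):
static void add_weighted_check()
{
    float r = std::fma(100.f, 0.75f, std::fma(200.f, 0.25f, 8.f));  // 75 + 50 + 8
    CV_Assert(r == 133.f);
}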
c_div(*scalar, denom) : 0.0; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void recip_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + scalar_loop(src1, step1, dst, step, width, height, &fscalar); +} + +template<> +void recip_loop(const double* src1, size_t step1, double* dst, size_t step, int width, int height, const double* scalar) +{ + SCALAR_LOOP64F(src1, step1, dst, step, width, height, scalar); +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +#undef SCALAR_ARGS +#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, _T1* dst, size_t step, int width, int height + +#undef SCALAR_ARGS_PASS +#define SCALAR_ARGS_PASS src1, step1, dst, step, width, height + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(const _T1*, size_t, SCALAR_ARGS(_T1), void* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ + CV_CPU_DISPATCH_MODES_ALL); \ + } + +DEFINE_SIMD_ALL(recip, recip_loop) + +#ifndef ARITHM_DISPATCHING_ONLY + CV_CPU_OPTIMIZATION_NAMESPACE_END +#endif + +#ifndef SIMD_GUARD + #define SIMD_GUARD +#endif + +}} // cv::hal:: \ No newline at end of file diff --git a/modules/core/src/arithm_core.hpp b/modules/core/src/arithm_core.hpp deleted file mode 100644 index 99b564cf74..0000000000 --- a/modules/core/src/arithm_core.hpp +++ /dev/null @@ -1,623 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2015, Itseez Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_ARITHM_CORE_HPP__ -#define __OPENCV_ARITHM_CORE_HPP__ - -#include "arithm_simd.hpp" - -namespace cv { - -template struct OpAdd -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } -}; - -template struct OpSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } -}; - -template struct OpRSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } -}; - -template struct OpMin -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct OpMax -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - -template struct OpAbsDiff -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()(T a, T b) const { return a > b ? a - b : b - a; } -}; - -// specializations to prevent "-0" results -template<> struct OpAbsDiff -{ - typedef float type1; - typedef float type2; - typedef float rtype; - float operator()(float a, float b) const { return std::abs(a - b); } -}; -template<> struct OpAbsDiff -{ - typedef double type1; - typedef double type2; - typedef double rtype; - double operator()(double a, double b) const { return std::abs(a - b); } -}; - -template struct OpAnd -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a & b; } -}; - -template struct OpOr -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a | b; } -}; - -template struct OpXor -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a ^ b; } -}; - -template struct OpNot -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T ) const { return ~a; } -}; - -//============================================================================= - -template -void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 || CV_NEON - VOp vop; -#endif - Op op; - - for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = vop(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); 
- typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); - r0 = vop(r0, VLoadStore128::load(src2 + x )); - r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 16/sizeof(T), r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_AVX2 - // nothing -#elif CV_SSE2 - if( USE_SSE2 ) - { - for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) ) - { - typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); - r = vop(r, VLoadStore64::load(src2 + x)); - VLoadStore64::store(dst + x, r); - } - } -#endif - -#if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -template -void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 || CV_NEON - Op32 op32; -#endif - Op op; - - for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 4, r1); - } - } - } -#endif // CV_AVX2 - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = op32(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128::load(src2 + x )); - r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 4, r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - - -template -void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 - Op64 op64; -#endif - Op op; - - for( ; 
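// The deleted vBinOp* templates above pair a scalar functor with hand-written
// VLoadStore64/128/256 wrappers per ISA; the new arithm.simd.hpp states each
// kernel once with universal intrinsics and leaves ISA selection to
// CV_CPU_DISPATCH. The same saturating u8 add in both styles (illustrative
// helpers; the SSE2 form needs <emmintrin.h>):
static void add_u8x16_sse2(const uchar* a, const uchar* b, uchar* dst)
{
    __m128i r = _mm_adds_epu8(_mm_loadu_si128((const __m128i*)a),
                              _mm_loadu_si128((const __m128i*)b));
    _mm_storeu_si128((__m128i*)dst, r);
}
static void add_u8_universal(const uchar* a, const uchar* b, uchar* dst)
{
    cv::v_store(dst, cv::vx_load(a) + cv::vx_load(b));  // '+' saturates for 8-bit lanes
}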
height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= width - 4; x += 4 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= width - 4; x += 4 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); - r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 2, r1); - } - } - } -#endif - - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -template static void -cmp_(const T* src1, size_t step1, const T* src2, size_t step2, - uchar* dst, size_t step, int width, int height, int code) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - Cmp_SIMD vop(code); - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] > src2[x]) ^ m; - t1 = -(src1[x+1] > src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] > src2[x+2]) ^ m; - t1 = -(src1[x+3] > src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] == src2[x]) ^ m; - t1 = -(src1[x+1] == src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] == src2[x+2]) ^ m; - t1 = -(src1[x+3] == src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -template static void -mul_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, WT scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Mul_SIMD vop; - - if( scale == (WT)1. 
) - { - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= width - 4; i += 4 ) - { - T t0; - T t1; - t0 = saturate_cast(src1[i ] * src2[i ]); - t1 = saturate_cast(src1[i+1] * src2[i+1]); - dst[i ] = t0; - dst[i+1] = t1; - - t0 = saturate_cast(src1[i+2] * src2[i+2]); - t1 = saturate_cast(src1[i+3] * src2[i+3]); - dst[i+2] = t0; - dst[i+3] = t1; - } - #endif - for( ; i < width; i++ ) - dst[i] = saturate_cast(src1[i] * src2[i]); - } - } - else - { - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= width - 4; i += 4 ) - { - T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); - T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); - dst[i] = t0; dst[i+1] = t1; - - t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); - t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < width; i++ ) - dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); - } - } -} - - -template static void -div_i( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - float scale_f = (float)scale; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - for( ; i < width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -div_f( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - T scale_f = (T)scale; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - for( ; i < width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -recip_i( const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - float scale_f = (float)scale; - - for( ; height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, width, scale); - for( ; i < width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; - } - } -} - -template static void -recip_f( const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - T scale_f = (T)scale; - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - - for( ; height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, width, scale); - for( ; i < width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? 
saturate_cast(scale_f/denom) : (T)0; - } - } -} - -template static void -addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - AddWeighted_SIMD vop; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, width, alpha, beta, gamma); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); - dst[x] = t0; dst[x+1] = t1; - - t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); - t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - } -} - -} // cv:: - - -#endif // __OPENCV_ARITHM_CORE_HPP__ diff --git a/modules/core/src/arithm_ipp.hpp b/modules/core/src/arithm_ipp.hpp new file mode 100644 index 0000000000..4aa7d006e4 --- /dev/null +++ b/modules/core/src/arithm_ipp.hpp @@ -0,0 +1,417 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#if ARITHM_USE_IPP + +namespace cv { namespace hal { + +//======================================= +// Arithmetic and logical operations +// +, -, *, /, &, |, ^, ~, abs ... +//======================================= + +#define ARITHM_IPP_BIN(fun, ...) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +//======================================= +// Addition +//======================================= + +inline int arithm_ipp_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add16s(const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_add8s(...) 0 +#define arithm_ipp_add32s(...) 0 +#define arithm_ipp_add64f(...) 
0 + +//======================================= +// Subtract +//======================================= + +inline int arithm_ipp_sub8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_8u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_16u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub16s(const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_16s_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_32f_C1R, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_sub8s(...) 0 +#define arithm_ipp_sub32s(...) 0 +#define arithm_ipp_sub64f(...) 0 + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#define ARITHM_IPP_MIN_MAX(fun, type) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + type* s1 = (type*)src1; \ + type* s2 = (type*)src2; \ + type* d = dst; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + int i = 0; \ + for(; i < height; i++) \ + { \ + if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \ + break; \ + s1 = (type*)((uchar*)s1 + step1); \ + s2 = (type*)((uchar*)s2 + step2); \ + d = (type*)((uchar*)d + step); \ + } \ + if (i == height) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +//======================================= +// Max +//======================================= + +inline int arithm_ipp_max8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_8u, uchar); +} + +inline int arithm_ipp_max16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_16u, ushort); +} + +inline int arithm_ipp_max32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_32f, float); +} + +inline int arithm_ipp_max64f(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_64f, double); +} + +#define arithm_ipp_max8s(...) 0 +#define arithm_ipp_max16s(...) 0 +#define arithm_ipp_max32s(...) 
0 + +//======================================= +// Min +//======================================= + +inline int arithm_ipp_min8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_8u, uchar); +} + +inline int arithm_ipp_min16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_16u, ushort); +} + +inline int arithm_ipp_min32f(const float* src1, size_t step1, const float* src2,size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_32f, float); +} + +inline int arithm_ipp_min64f(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_64f, double); +} + +#define arithm_ipp_min8s(...) 0 +#define arithm_ipp_min16s(...) 0 +#define arithm_ipp_min32s(...) 0 + +//======================================= +// AbsDiff +//======================================= + +inline int arithm_ipp_absdiff8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_absdiff16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_absdiff32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} +#define arithm_ipp_absdiff8s(...) 0 +#define arithm_ipp_absdiff16s(...) 0 +#define arithm_ipp_absdiff32s(...) 0 +#define arithm_ipp_absdiff64f(...) 0 + +//======================================= +// Logical +//======================================= + +inline int arithm_ipp_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAnd_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiOr_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiXor_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height) +{ + if (!CV_IPP_CHECK_COND) + return 0; + if (height == 1) + step1 = step = width * sizeof(dst[0]); + if (0 <= CV_INSTRUMENT_FUN_IPP(ippiNot_8u_C1R, src1, (int)step1, dst, (int)step, ippiSize(width, height))) + { + CV_IMPL_ADD(CV_IMPL_IPP); + return 1; + } + setIppErrorStatus(); + return 0; +} + +//======================================= +// Compare +//======================================= + +#define ARITHM_IPP_CMP(fun, ...) 
\ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + IppCmpOp op = arithm_ipp_convert_cmp(cmpop); \ + if (op < 0) \ + return 0; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__, op)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +inline IppCmpOp arithm_ipp_convert_cmp(int cmpop) +{ + switch(cmpop) + { + case CMP_EQ: return ippCmpEq; + case CMP_GT: return ippCmpGreater; + case CMP_GE: return ippCmpGreaterEq; + case CMP_LT: return ippCmpLess; + case CMP_LE: return ippCmpLessEq; + default: return (IppCmpOp)-1; + } +} + +inline int arithm_ipp_cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_16s_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_cmp8s(...) 0 +#define arithm_ipp_cmp32s(...) 0 +#define arithm_ipp_cmp64f(...) 0 + +//======================================= +// Multiply +//======================================= + +#define ARITHM_IPP_MUL(fun, ...) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + float fscale = (float)scale; \ + if (std::fabs(fscale - 1) > FLT_EPSILON) \ + return 0; \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +inline int arithm_ipp_mul8u(const uchar *src1, size_t step1, const uchar *src2, size_t step2, + uchar *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_8u_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} +inline int arithm_ipp_mul16u(const ushort *src1, size_t step1, const ushort *src2, size_t step2, + ushort *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_16u_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_mul16s(const short *src1, size_t step1, const short *src2, size_t step2, + short *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_16s_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2, size_t step2, + float *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_32f_C1R, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_mul8s(...) 0 +#define arithm_ipp_mul32s(...) 
0 +#define arithm_ipp_mul64f(...) 0 + +//======================================= +// Div +//======================================= + +#define arithm_ipp_div8u(...) 0 +#define arithm_ipp_div8s(...) 0 +#define arithm_ipp_div16u(...) 0 +#define arithm_ipp_div16s(...) 0 +#define arithm_ipp_div32s(...) 0 +#define arithm_ipp_div32f(...) 0 +#define arithm_ipp_div64f(...) 0 + +//======================================= +// AddWeighted +//======================================= + +#define arithm_ipp_addWeighted8u(...) 0 +#define arithm_ipp_addWeighted8s(...) 0 +#define arithm_ipp_addWeighted16u(...) 0 +#define arithm_ipp_addWeighted16s(...) 0 +#define arithm_ipp_addWeighted32s(...) 0 +#define arithm_ipp_addWeighted32f(...) 0 +#define arithm_ipp_addWeighted64f(...) 0 + +//======================================= +// Reciprocal +//======================================= + +#define arithm_ipp_recip8u(...) 0 +#define arithm_ipp_recip8s(...) 0 +#define arithm_ipp_recip16u(...) 0 +#define arithm_ipp_recip16s(...) 0 +#define arithm_ipp_recip32s(...) 0 +#define arithm_ipp_recip32f(...) 0 +#define arithm_ipp_recip64f(...) 0 + +/** empty block in case you have "fun" +#define arithm_ipp_8u(...) 0 +#define arithm_ipp_8s(...) 0 +#define arithm_ipp_16u(...) 0 +#define arithm_ipp_16s(...) 0 +#define arithm_ipp_32s(...) 0 +#define arithm_ipp_32f(...) 0 +#define arithm_ipp_64f(...) 0 +**/ + +}} // cv::hal:: + +#define ARITHM_CALL_IPP(fun, ...) \ +{ \ + if (__CV_EXPAND(fun(__VA_ARGS__))) \ + return; \ +} + +#endif // ARITHM_USE_IPP + + +#if !ARITHM_USE_IPP +#define ARITHM_CALL_IPP(...) +#endif \ No newline at end of file diff --git a/modules/core/src/arithm_simd.hpp b/modules/core/src/arithm_simd.hpp deleted file mode 100644 index 5a37b4c200..0000000000 --- a/modules/core/src/arithm_simd.hpp +++ /dev/null @@ -1,2025 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2015, Itseez Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_ARITHM_SIMD_HPP__ -#define __OPENCV_ARITHM_SIMD_HPP__ - -namespace cv { - -struct NOP {}; - -#if CV_SSE2 || CV_NEON -#define IF_SIMD(op) op -#else -#define IF_SIMD(op) NOP -#endif - - -#if CV_SSE2 || CV_NEON - -#define FUNCTOR_TEMPLATE(name) \ - template struct name {} - -FUNCTOR_TEMPLATE(VLoadStore128); -#if CV_SSE2 -FUNCTOR_TEMPLATE(VLoadStore64); -FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX2 -FUNCTOR_TEMPLATE(VLoadStore256); -FUNCTOR_TEMPLATE(VLoadStore256Aligned); -#endif -#endif - -#endif - -#if CV_AVX2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return 
_mm256_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); - - -static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, - 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, - 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m256i d = _mm256_subs_epi8(a, b); - __m256i m = _mm256_cmpgt_epi8(b, a); - return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m256i M = _mm256_max_epi16(a, b); - __m256i m = _mm256_min_epi16(a, b); - return _mm256_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m256i d = _mm256_sub_epi32(a, b); - __m256i m = _mm256_cmpgt_epi32(b, a); - return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); - -#elif CV_SSE2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static 
reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); - -FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); -FUNCTOR_CLOSURE_2arg(VMin, short, return 
_mm_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); - - -static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m128i d = _mm_subs_epi8(a, b); - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_subs_epi8(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m128i M = _mm_max_epi16(a, b); - __m128i m = _mm_min_epi16(a, b); - return _mm_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m128i d = _mm_sub_epi32(a, b); - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_sub_epi32(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); -#endif - -#if CV_NEON - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p);}; \ - static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type b) const \ - { \ - return body; \ - }; \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type ) const \ - { \ - return body; \ - }; \ - } - -FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); -FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); -FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); -FUNCTOR_LOADSTORE(VLoadStore128, 
short, int16x8_t, vld1q_s16, vst1q_s16); -FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); -FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); -#endif - - -template -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int) - { - } - - int operator () (const T *, const T *, uchar *, int) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - v_mask = vdupq_n_u8(255); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); - - return x; - } - - int code; - uint8x16_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code 
== CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const float * src1, const float * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == 
CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -#elif CV_SSE2 - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi8(-1); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi32(0xffffffff); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = 
_mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -#endif - - -template -struct Mul_SIMD -{ - int operator() (const T *, const T *, T *, int, WT) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Mul_SIMD -{ - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - 
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - float32x4_t v_dst2 = 
vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - v_dst1 = vmulq_f32(v_dst1, v_scale); - - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - } - - return x; - } -}; - -#elif CV_SSE2 - -#if CV_SSE4_1 - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - else - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - } - - return x; - } - - bool haveSSE; -}; - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template -struct Div_SIMD -{ - int operator() (const T *, const T *, T *, int, double) const - { - return 0; - } -}; - -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; - } -}; - - -#if CV_SIMD128 - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load_expand(src1 + x); - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load_expand(src1 + x); - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = 
v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load(src1 + x); - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load(src1 + x); - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src1 + x); - v_int32x4 t1 = v_load(src1 + x + 4); - v_int32x4 t2 = v_load(src2 + x); - v_int32x4 t3 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t2 == v_zero, v_zero, res0); - res1 = v_select(t3 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src1 + x); - v_float32x4 f1 = v_load(src1 + x + 4); - v_float32x4 f2 = v_load(src2 + 
x); - v_float32x4 f3 = v_load(src2 + x + 4); - - v_float32x4 res0 = f0 * v_scale / f2; - v_float32x4 res1 = f1 * v_scale / f3; - - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -///////////////////////// RECIPROCAL ////////////////////// - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool 
haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src2 + x); - v_int32x4 t1 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t0 == v_zero, v_zero, res0); - res1 = v_select(t1 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src2 + x); - v_float32x4 f1 = v_load(src2 + x + 4); - - v_float32x4 res0 = v_scale / f0; - v_float32x4 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - -#if CV_SIMD128_64F - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src1 + x); - v_float64x2 f1 = v_load(src1 + x + 2); - v_float64x2 f2 = v_load(src2 + x); - v_float64x2 f3 = v_load(src2 + x + 2); - - v_float64x2 res0 = f0 * v_scale / f2; - v_float64x2 res1 = f1 * v_scale / f3; - - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src2 + x); - v_float64x2 f1 = v_load(src2 + x + 2); - - v_float64x2 res0 = v_scale / f0; - v_float64x2 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -#endif - -#endif - - -template -struct AddWeighted_SIMD -{ - int operator() (const T *, const T *, T *, int, WT, WT, WT) const - { - return 0; - } -}; - -#if CV_SSE2 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x 
<= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - - __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1)); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); - } - - return x; - } - - bool haveSSE2; -}; - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - return x; - } - - bool haveSSE2; -}; - -#if CV_SSE4_1 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE4_1) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - 
} - - return x; - } - - bool haveSSE4_1; -}; - -#endif - -#elif CV_NEON - -template <> -struct AddWeighted_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= width - 8; x += 8 ) - { - int8x8_t in1 = vld1_s8(src1 + x); - int16x8_t in1_16 = vmovl_s8(in1); - float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); - - int8x8_t in2 = vld1_s8(src2+x); - int16x8_t in2_16 = vmovl_s8(in2); - float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); - float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); - int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); - - int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); - int8x8_t out = vqmovn_s16(out_16); - - vst1_s8(dst + x, out); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); - uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); - uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); - int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); - int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); - } - - return x; - } -}; - -#endif - -} - -#endif // __OPENCV_ARITHM_SIMD_HPP__ diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 6b3b23cddb..0d7035f1bd 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -86,7 +86,6 @@ #include "opencv2/core/sse_utils.hpp" #include "opencv2/core/neon_utils.hpp" #include "opencv2/core/vsx_utils.hpp" -#include 
"arithm_core.hpp" #include "hal_replacement.hpp" #ifdef HAVE_TEGRA_OPTIMIZATION @@ -110,6 +109,102 @@ extern const uchar g_Saturate8u[]; #define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) #define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) +template struct OpAdd +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } +}; + +template struct OpSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } +}; + +template struct OpRSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } +}; + +template struct OpMin +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::min(a, b); } +}; + +template struct OpMax +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::max(a, b); } +}; + +template struct OpAbsDiff +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()(T a, T b) const { return a > b ? a - b : b - a; } +}; + +// specializations to prevent "-0" results +template<> struct OpAbsDiff +{ + typedef float type1; + typedef float type2; + typedef float rtype; + float operator()(float a, float b) const { return std::abs(a - b); } +}; +template<> struct OpAbsDiff +{ + typedef double type1; + typedef double type2; + typedef double rtype; + double operator()(double a, double b) const { return std::abs(a - b); } +}; + +template struct OpAnd +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a & b; } +}; + +template struct OpOr +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a | b; } +}; + +template struct OpXor +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a ^ b; } +}; + +template struct OpNot +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T ) const { return ~a; } +}; + template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const { return CV_FAST_CAST_8U(a + b); } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 40d282b1c2..b28929c582 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -119,11 +119,15 @@ template struct Data d[i] += (LaneType)m; return *this; } - void fill(LaneType val) + void fill(LaneType val, int s, int c = R::nlanes) { - for (int i = 0; i < R::nlanes; ++i) + for (int i = s; i < c; ++i) d[i] = val; } + void fill(LaneType val) + { + fill(val, 0); + } void reverse() { for (int i = 0; i < R::nlanes / 2; ++i) @@ -739,6 +743,23 @@ template struct TheTest return *this; } + TheTest & test_absdiffs() + { + Data dataA(std::numeric_limits::max()), + dataB(std::numeric_limits::min()); + dataA[0] = (LaneType)-1; + dataB[0] = 1; + dataA[1] = 2; + dataB[1] = (LaneType)-2; + R a = dataA, b = dataB; + Data resC = v_absdiffs(a, b); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(saturate_cast(std::abs(dataA[i] - dataB[i])), resC[i]); + } + return *this; + } + TheTest & test_reduce() { Data dataA; @@ -874,6 +895,81 @@ template struct TheTest return *this; } + // v_uint8 only + TheTest & test_pack_b() + { + // 16-bit + Data dataA, dataB; + dataB.fill(0, R::nlanes / 2); + + R a = 
dataA, b = dataB; + Data maskA = a == b, maskB = a != b; + + a = maskA; b = maskB; + Data res = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); + for (int i = 0; i < v_uint16::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 2], res[i]); + EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]); + } + + // 32-bit + Data dataC, dataD; + dataD.fill(0, R::nlanes / 2); + + R c = dataC, d = dataD; + Data maskC = c == d, maskD = c != d; + + c = maskC; d = maskD; + res = v_pack_b + ( + v_reinterpret_as_u32(a), v_reinterpret_as_u32(b), + v_reinterpret_as_u32(c), v_reinterpret_as_u32(d) + ); + + for (int i = 0; i < v_uint32::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 4], res[i]); + EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]); + EXPECT_EQ(maskC[i * 4], res[i + v_uint32::nlanes * 2]); + EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]); + } + + // 64-bit + Data dataE, dataF, dataG(0), dataH(0xFF); + dataF.fill(0, R::nlanes / 2); + + R e = dataE, f = dataF, g = dataG, h = dataH; + Data maskE = e == f, maskF = e != f; + + e = maskE; f = maskF; + res = v_pack_b + ( + v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), + v_reinterpret_as_u64(c), v_reinterpret_as_u64(d), + v_reinterpret_as_u64(e), v_reinterpret_as_u64(f), + v_reinterpret_as_u64(g), v_reinterpret_as_u64(h) + ); + + for (int i = 0; i < v_uint64::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 8], res[i]); + EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]); + EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]); + EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]); + + EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]); + EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]); + EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]); + EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]); + } + + return *this; + } + TheTest & test_unpack() { Data dataA, dataB; @@ -1228,6 +1324,7 @@ void test_hal_intrin_uint8() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() + .test_pack_b() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() @@ -1259,6 +1356,7 @@ void test_hal_intrin_int8() .test_logic() .test_min_max() .test_absdiff() + .test_absdiffs() .test_abs() .test_mask() .test_popcount() @@ -1317,6 +1415,7 @@ void test_hal_intrin_int16() .test_logic() .test_min_max() .test_absdiff() + .test_absdiffs() .test_abs() .test_reduce() .test_mask()
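
The Div_SIMD and Recip_SIMD specializations removed earlier in this diff all repeat one pattern: widen the inputs, convert to float, apply the scale, divide, force zero wherever the divisor is zero, then round and narrow back. Below is a minimal sketch of that pattern written against the 128-bit universal intrinsics used elsewhere in this patch; div_scale_s16 is a hypothetical helper name (not an OpenCV API), and handling of the scalar tail is assumed to live in the caller.

// Hypothetical sketch (not part of the patch): zero-guarded scaled division
// for 16-bit signed data, mirroring the removed Div_SIMD<short> code path.
#include "opencv2/core/hal/intrin.hpp"

static int div_scale_s16(const short* src1, const short* src2, short* dst,
                         int width, double scale)
{
    int x = 0;
#if CV_SIMD128
    v_float32x4 v_scale = v_setall_f32((float)scale);
    v_int16x8 v_zero = v_setzero_s16();

    for( ; x <= width - 8; x += 8 )
    {
        v_int16x8 a = v_load(src1 + x), b = v_load(src2 + x);

        // widen the 16-bit lanes to 32-bit and convert to float
        v_int32x4 a0, a1, b0, b1;
        v_expand(a, a0, a1);
        v_expand(b, b0, b1);

        v_float32x4 f0 = v_cvt_f32(a0) * v_scale / v_cvt_f32(b0);
        v_float32x4 f1 = v_cvt_f32(a1) * v_scale / v_cvt_f32(b1);

        // round, narrow with saturation, and return 0 wherever the divisor is 0
        v_int16x8 res = v_pack(v_round(f0), v_round(f1));
        res = v_select(b == v_zero, v_zero, res);
        v_store(dst + x, res);
    }
#endif
    return x;  // the caller finishes the remaining width - x elements with scalar code
}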