diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 9dcfc5623a..031f8f3d02 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -60,255 +60,72 @@ // access from within opencv code more accessible namespace cv { -#ifndef CV_DOXYGEN - -#ifdef CV_CPU_DISPATCH_MODE -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) { -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } -#else -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline { -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } -#endif - - -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END -using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -#endif - -//! @addtogroup core_hal_intrin -//! @{ - -//! @cond IGNORED template struct V_TypeTraits { - typedef _Tp int_type; - typedef _Tp uint_type; - typedef _Tp abs_type; - typedef _Tp sum_type; - - enum { delta = 0, shift = 0 }; - - static int_type reinterpret_int(_Tp x) { return x; } - static uint_type reinterpet_uint(_Tp x) { return x; } - static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; } -}; - -template<> struct V_TypeTraits -{ - typedef uchar value_type; - typedef schar int_type; - typedef uchar uint_type; - typedef uchar abs_type; - typedef int sum_type; - - typedef ushort w_type; - typedef unsigned q_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef schar value_type; - typedef schar int_type; - typedef uchar uint_type; - typedef uchar abs_type; - typedef int sum_type; - - typedef short w_type; - typedef int q_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef ushort value_type; - typedef short int_type; - typedef ushort uint_type; - typedef ushort abs_type; - typedef int sum_type; - - typedef unsigned w_type; - typedef uchar nu_type; - - enum { delta = 32768, shift = 16 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef short value_type; - typedef short int_type; - typedef ushort uint_type; - typedef ushort abs_type; - typedef int sum_type; - - typedef int w_type; - typedef uchar nu_type; - typedef schar n_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef unsigned value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef unsigned abs_type; - 
typedef unsigned sum_type; - - typedef uint64 w_type; - typedef ushort nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef int value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef unsigned abs_type; - typedef int sum_type; - - typedef int64 w_type; - typedef short n_type; - typedef ushort nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef uint64 value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef uint64 abs_type; - typedef uint64 sum_type; - - typedef unsigned nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef int64 value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef uint64 abs_type; - typedef int64 sum_type; - - typedef int nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - - -template<> struct V_TypeTraits -{ - typedef float value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef float abs_type; - typedef float sum_type; - - typedef double w_type; - - static int_type reinterpret_int(value_type x) - { - Cv32suf u; - u.f = x; - return u.i; - } - static uint_type reinterpet_uint(value_type x) - { - Cv32suf u; - u.f = x; - return u.u; - } - static value_type reinterpret_from_int(int_type x) - { - Cv32suf u; - u.i = x; - return u.f; - } }; -template<> struct V_TypeTraits -{ - typedef double value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef double abs_type; - typedef double sum_type; - static int_type reinterpret_int(value_type x) - { - Cv64suf u; - u.f = x; - return u.i; - } - static uint_type reinterpet_uint(value_type x) - { - Cv64suf u; - u.f = x; - return u.u; +#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \ + template<> struct V_TypeTraits \ + { \ + typedef type value_type; \ + typedef int_type_ int_type; \ + typedef abs_type_ abs_type; \ + typedef uint_type_ uint_type; \ + typedef w_type_ w_type; \ + typedef q_type_ q_type; \ + typedef sum_type_ sum_type; \ + enum { nlanes128 = nlanes128_ }; \ + \ + static inline int_type reinterpret_int(type x) \ + { \ + union { type l; int_type i; } v; \ + v.l = x; \ + return v.i; \ + } \ + \ + static inline type reinterpret_from_int(int_type x) \ + { \ + union { type l; int_type i; } v; \ + v.i = x; \ + return v.l; \ + } \ } - static value_type reinterpret_from_int(int_type x) - { - Cv64suf u; - u.i = x; - return u.f; - } -}; -template struct V_SIMD128Traits -{ - enum { nlanes = 16 / sizeof(T) }; -}; +CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16); +CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16); 
+CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8); +CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8); +CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4); +CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4); +CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4); +CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2); +CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2); +CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2); -//! @endcond +#ifndef CV_DOXYGEN -//! @} +#ifdef CV_CPU_DISPATCH_MODE + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) { + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#else + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline { + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#endif -#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif } #ifdef CV_DOXYGEN +# undef CV_AVX2 # undef CV_SSE2 # undef CV_NEON # undef CV_VSX +# undef CV_FP16 #endif #if CV_SSE2 @@ -325,27 +142,25 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END #else +#define CV_SIMD128_CPP 1 #include "opencv2/core/hal/intrin_cpp.hpp" #endif -//! @addtogroup core_hal_intrin -//! @{ +// AVX2 can be used together with SSE2, so +// we define those two sets of intrinsics at once. +// Most of the intrinsics do not conflict (the proper overloaded variant is +// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2), +// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load(). +// Correspondingly, the wide intrinsics (which are mapped to the "widest" +// available instruction set) will get vx_ prefix +// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load()) +#if CV_AVX2 -#ifndef CV_SIMD128 -//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled) -#define CV_SIMD128 0 -#endif +#include "opencv2/core/hal/intrin_avx.hpp" -#ifndef CV_SIMD128_64F -//! Set to 1 if current intrinsics implementation supports 64-bit float vectors -#define CV_SIMD128_64F 0 #endif -//! @} - -//================================================================================================== - //! 
@cond IGNORED namespace cv { @@ -354,88 +169,175 @@ namespace cv { CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #endif -template struct V_RegTrait128; +#ifndef CV_SIMD128 +#define CV_SIMD128 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint8x16 reg; - typedef v_uint16x8 w_reg; - typedef v_uint32x4 q_reg; - typedef v_uint8x16 u_reg; - static v_uint8x16 zero() { return v_setzero_u8(); } - static v_uint8x16 all(uchar val) { return v_setall_u8(val); } -}; +#ifndef CV_SIMD128_64F +#define CV_SIMD128_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int8x16 reg; - typedef v_int16x8 w_reg; - typedef v_int32x4 q_reg; - typedef v_uint8x16 u_reg; - static v_int8x16 zero() { return v_setzero_s8(); } - static v_int8x16 all(schar val) { return v_setall_s8(val); } -}; +#ifndef CV_SIMD256 +#define CV_SIMD256 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint16x8 reg; - typedef v_uint32x4 w_reg; - typedef v_int16x8 int_reg; - typedef v_uint16x8 u_reg; - static v_uint16x8 zero() { return v_setzero_u16(); } - static v_uint16x8 all(ushort val) { return v_setall_u16(val); } -}; +#ifndef CV_SIMD256_64F +#define CV_SIMD256_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int16x8 reg; - typedef v_int32x4 w_reg; - typedef v_uint16x8 u_reg; - static v_int16x8 zero() { return v_setzero_s16(); } - static v_int16x8 all(short val) { return v_setall_s16(val); } -}; +#ifndef CV_SIMD512 +#define CV_SIMD512 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint32x4 reg; - typedef v_uint64x2 w_reg; - typedef v_int32x4 int_reg; - typedef v_uint32x4 u_reg; - static v_uint32x4 zero() { return v_setzero_u32(); } - static v_uint32x4 all(unsigned val) { return v_setall_u32(val); } -}; +#ifndef CV_SIMD512_64F +#define CV_SIMD512_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int32x4 reg; - typedef v_int64x2 w_reg; - typedef v_uint32x4 u_reg; - static v_int32x4 zero() { return v_setzero_s32(); } - static v_int32x4 all(int val) { return v_setall_s32(val); } -}; +#if CV_SIMD512 + #define CV_SIMD 1 + #define CV_SIMD_64F CV_SIMD512_64F + #define CV_SIMD_WIDTH 64 +#elif CV_SIMD256 + #define CV_SIMD 1 + #define CV_SIMD_64F CV_SIMD256_64F + #define CV_SIMD_WIDTH 32 +#else + #define CV_SIMD CV_SIMD128 + #define CV_SIMD_64F CV_SIMD128_64F + #define CV_SIMD_WIDTH 16 +#endif -template <> struct V_RegTrait128 { - typedef v_uint64x2 reg; - static v_uint64x2 zero() { return v_setzero_u64(); } - static v_uint64x2 all(uint64 val) { return v_setall_u64(val); } -}; +//================================================================================================== -template <> struct V_RegTrait128 { - typedef v_int64x2 reg; - static v_int64x2 zero() { return v_setzero_s64(); } - static v_int64x2 all(int64 val) { return v_setall_s64(val); } +#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ + inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \ + inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \ + inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \ + inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \ + inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \ + inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } + +#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ +inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } + 
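
As a usage sketch (not part of the patch itself): the vx_* wrappers produced by the CV_INTRIN_DEFINE_WIDE_INTRIN machinery defined here (and instantiated further down for each lane type) are what let a single kernel body compile against the widest enabled instruction set, 128-bit SSE2/NEON (v_ prefix) or 256-bit AVX2 (v256_ prefix). A minimal illustration, assuming this header is included and CV_SIMD is non-zero; the function name add_f32 is hypothetical and not part of the patch:

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD
// Adds two float arrays using the width-agnostic wide universal intrinsics.
static void add_f32(const float* a, const float* b, float* dst, int n)
{
    int i = 0;
    const int step = v_float32::nlanes;     // 4 with CV_SIMD128, 8 with CV_SIMD256
    for (; i <= n - step; i += step)
    {
        v_float32 va = vx_load(a + i);      // resolves to v_load() or v256_load()
        v_float32 vb = vx_load(b + i);
        vx_store(dst + i, va + vb);         // stores with the same register width
    }
    for (; i < n; i++)                      // scalar tail for the leftover elements
        dst[i] = a[i] + b[i];
    vx_cleanup();                           // maps to v_cleanup() / v256_cleanup()
}
#endif

With CV_SIMD256 enabled the loop consumes 8 floats per iteration, with only CV_SIMD128 the very same source consumes 4; selecting that width at compile time is the purpose of the vx_ / CV_SIMD_WIDTH layer introduced above.
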
+#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \ +inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } + +#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) + +#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) + +template struct V_RegTraits +{ }; -template <> struct V_RegTrait128 { - typedef v_float32x4 reg; - typedef v_int32x4 int_reg; - typedef v_float32x4 u_reg; - static v_float32x4 zero() { return v_setzero_f32(); } - static v_float32x4 all(float val) { return v_setall_f32(val); } -}; +#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \ + template<> struct V_RegTraits<_reg> \ + { \ + typedef _reg reg; \ + typedef _u_reg u_reg; \ + typedef _w_reg w_reg; \ + typedef _q_reg q_reg; \ + typedef _int_reg int_reg; \ + typedef _round_reg round_reg; \ + } +#if CV_SIMD128 || CV_SIMD128_CPP + CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void); + CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void); + CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void); + CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void); + CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void); + CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void); #if CV_SIMD128_64F -template <> struct V_RegTrait128 { - typedef v_float64x2 reg; - typedef v_int32x4 int_reg; - typedef v_float64x2 u_reg; - static v_float64x2 zero() { return v_setzero_f64(); } - static v_float64x2 all(double val) { return v_setall_f64(val); } -}; + CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4); +#else + CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4); +#endif + CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void); + CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void); +#if CV_SIMD128_64F + CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4); +#endif +#if CV_FP16 + CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8); 
+#endif +#endif + +#if CV_SIMD256 + CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void); + CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void); + CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void); + CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void); + CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void); + CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void); + CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8); + CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void); + CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void); + CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8); +#if CV_FP16 + CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void); +#endif +#endif + +#if CV_SIMD256 + typedef v_uint8x32 v_uint8; + typedef v_int8x32 v_int8; + typedef v_uint16x16 v_uint16; + typedef v_int16x16 v_int16; + typedef v_uint32x8 v_uint32; + typedef v_int32x8 v_int32; + typedef v_uint64x4 v_uint64; + typedef v_int64x4 v_int64; + typedef v_float32x8 v_float32; + #if CV_SIMD256_64F + typedef v_float64x4 v_float64; + #endif + #if CV_FP16 + typedef v_float16x16 v_float16; + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16) + #endif + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) + CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) + inline void vx_cleanup() { v256_cleanup(); } +#elif CV_SIMD128 + typedef v_uint8x16 v_uint8; + typedef v_int8x16 v_int8; + typedef v_uint16x8 v_uint16; + typedef v_int16x8 v_int16; + typedef v_uint32x4 v_uint32; + typedef v_int32x4 v_int32; + typedef v_uint64x2 v_uint64; + typedef v_int64x2 v_int64; + typedef v_float32x4 v_float32; + #if CV_SIMD128_64F + typedef v_float64x2 v_float64; + #endif + #if CV_FP16 + typedef v_float16x8 v_float16; + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16) + #endif + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) + #if CV_SIMD128_64F + CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) + #endif + inline void vx_cleanup() { v_cleanup(); } #endif inline unsigned int trailingZeros32(unsigned int value) { diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp new file mode 100644 index 0000000000..7e983fd24f --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -0,0 +1,2016 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_AVX_HPP +#define OPENCV_HAL_INTRIN_AVX_HPP + +#define CV_SIMD256 1 +#define CV_SIMD256_64F 1 + +namespace cv +{ + +//! 
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Utils //////////// + +inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) +{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); } + +inline __m256 _v256_combine(const __m128& lo, const __m128& hi) +{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); } + +inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) +{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); } + +inline int _v_cvtsi256_si32(const __m256i& a) +{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); } + +inline __m256i _v256_shuffle_odd_64(const __m256i& v) +{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); } + +inline __m256d _v256_shuffle_odd_64(const __m256d& v) +{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); } + +template +inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +{ return _mm256_permute2x128_si256(a, b, imm); } + +template +inline __m256 _v256_permute2x128(const __m256& a, const __m256& b) +{ return _mm256_permute2f128_ps(a, b, imm); } + +template +inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) +{ return _mm256_permute2f128_pd(a, b, imm); } + +template +inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_permute2x128(a.val, b.val)); } + +template +inline __m256i _v256_permute4x64(const __m256i& a) +{ return _mm256_permute4x64_epi64(a, imm); } + +template +inline __m256d _v256_permute4x64(const __m256d& a) +{ return _mm256_permute4x64_pd(a, imm); } + +template +inline _Tpvec v256_permute4x64(const _Tpvec& a) +{ return _Tpvec(_v256_permute4x64(a.val)); } + +inline __m128i _v256_extract_high(const __m256i& v) +{ return _mm256_extracti128_si256(v, 1); } + +inline __m128 _v256_extract_high(const __m256& v) +{ return _mm256_extractf128_ps(v, 1); } + +inline __m128d _v256_extract_high(const __m256d& v) +{ return _mm256_extractf128_pd(v, 1); } + +inline __m128i _v256_extract_low(const __m256i& v) +{ return _mm256_castsi256_si128(v); } + +inline __m128 _v256_extract_low(const __m256& v) +{ return _mm256_castps256_ps128(v); } + +inline __m128d _v256_extract_low(const __m256d& v) +{ return _mm256_castpd256_pd128(v); } + +///////// Types //////////// + +struct v_uint8x32 +{ + typedef uchar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_uint8x32(__m256i v) : val(v) {} + v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31) + { + val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9, + (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15, + (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21, + (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27, + (char)v28, (char)v29, (char)v30, (char)v31); + } + v_uint8x32() : val(_mm256_setzero_si256()) {} + uchar get0() const { return (uchar)_v_cvtsi256_si32(val); } +}; + +struct v_int8x32 +{ + typedef schar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_int8x32(__m256i v) : val(v) {} + v_int8x32(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar 
v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31) + { + val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, + v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + v_int8x32() : val(_mm256_setzero_si256()) {} + schar get0() const { return (schar)_v_cvtsi256_si32(val); } +}; + +struct v_uint16x16 +{ + typedef ushort lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_uint16x16(__m256i v) : val(v) {} + v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15) + { + val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9, + (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15); + } + v_uint16x16() : val(_mm256_setzero_si256()) {} + ushort get0() const { return (ushort)_v_cvtsi256_si32(val); } +}; + +struct v_int16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_int16x16(__m256i v) : val(v) {} + v_int16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } + v_int16x16() : val(_mm256_setzero_si256()) {} + short get0() const { return (short)_v_cvtsi256_si32(val); } +}; + +struct v_uint32x8 +{ + typedef unsigned lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_uint32x8(__m256i v) : val(v) {} + v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7) + { + val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2, + (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7); + } + v_uint32x8() : val(_mm256_setzero_si256()) {} + unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); } +}; + +struct v_int32x8 +{ + typedef int lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_int32x8(__m256i v) : val(v) {} + v_int32x8(int v0, int v1, int v2, int v3, + int v4, int v5, int v6, int v7) + { + val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); + } + v_int32x8() : val(_mm256_setzero_si256()) {} + int get0() const { return _v_cvtsi256_si32(val); } +}; + +struct v_float32x8 +{ + typedef float lane_type; + enum { nlanes = 8 }; + __m256 val; + + explicit v_float32x8(__m256 v) : val(v) {} + v_float32x8(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7); + } + v_float32x8() : val(_mm256_setzero_ps()) {} + float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); } +}; + +struct v_uint64x4 +{ + typedef uint64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_uint64x4(__m256i v) : val(v) {} + v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3) + { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); } + v_uint64x4() : val(_mm256_setzero_si256()) {} + uint64 get0() const + { return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } +}; + +struct v_int64x4 +{ + 
typedef int64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_int64x4(__m256i v) : val(v) {} + v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) + { val = _mm256_setr_epi64x(v0, v1, v2, v3); } + v_int64x4() : val(_mm256_setzero_si256()) {} + int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } +}; + +struct v_float64x4 +{ + typedef double lane_type; + enum { nlanes = 4 }; + __m256d val; + + explicit v_float64x4(__m256d v) : val(v) {} + v_float64x4(double v0, double v1, double v2, double v3) + { val = _mm256_setr_pd(v0, v1, v2, v3); } + v_float64x4() : val(_mm256_setzero_pd()) {} + double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); } +}; + +struct v_float16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_float16x16(__m256i v) : val(v) {} + v_float16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_float16x16() : val(_mm256_setzero_si256()) {} + short get0() const { return (short)_v_cvtsi256_si32(val); } +}; +inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); } +inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); } + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpvec(_mm256_castsi128_si256(v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \ + __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm256_store_si256((__m256i*)ptr, a.val); } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64) + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm256_load_##suffix(ptr)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \ + (_mm_loadu_##suffix(ptr))); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + 
halfreg vlo = _mm_loadu_##suffix(ptr0); \ + halfreg vhi = _mm_loadu_##suffix(ptr1); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm256_store_##suffix(ptr, a.val); } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128) +OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d) + +#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(_mm256_setzero_si256()); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256) + +OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char) +OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char) +OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short) +OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short) +OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int) +OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int) +OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64) +OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64) + +#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(_mm256_setzero_##zsuffix()); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm256_set1_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast) + +OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd) + +inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a) +{ return a; } +inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a) +{ return v_float32x8(_mm256_castpd_ps(a.val)); } + +inline v_float64x4 v_reinterpret_as_f64(const 
v_float64x4& a) +{ return a; } +inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a) +{ return v_float64x4(_mm256_castps_pd(a.val)); } + +inline v_float16x16 v256_load_f16(const short* ptr) +{ return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); } +inline v_float16x16 v256_load_f16_aligned(const short* ptr) +{ return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); } + +inline void v_store(short* ptr, const v_float16x16& a) +{ _mm256_storeu_si256((__m256i*)ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x16& a) +{ _mm256_store_si256((__m256i*)ptr, a.val); } + +/* Recombine */ +/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(perm(a.val, b.val, 0x20)); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(perm(a.val, b.val, 0x31)); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { c = v_combine_low(a, b); d = v_combine_high(a, b); } + +#define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \ + inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \ + _Tpvec& b0, _Tpvec& b1) \ + { \ + __m256i v0 = _v256_shuffle_odd_64(a0.val); \ + __m256i v1 = _v256_shuffle_odd_64(a1.val); \ + b0.val = _mm256_unpacklo_##suffix(v0, v1); \ + b1.val = _mm256_unpackhi_##suffix(v0, v1); \ + } + +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64) +OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps) +OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd) + +inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1) +{ + __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val); + __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val); + v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1); +} + +inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1) +{ + __m256d v0 = _v_shuffle_odd_64(a0.val); + __m256d v1 = _v_shuffle_odd_64(a1.val); + b0.val = _mm256_unpacklo_pd(v0, v1); + b1.val = _mm256_unpackhi_pd(v0, v1); +}*/ + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \ + inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd) + +// blend +#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \ + template \ + inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_blend_##suffix(a.val, 
b.val, m)); } + +OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd) + +template +inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b) +{ + enum {M0 = m}; + enum {M1 = (M0 | (M0 << 2)) & 0x33}; + enum {M2 = (M1 | (M1 << 1)) & 0x55}; + enum {MM = M2 | (M2 << 1)}; + return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM)); +} +template +inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b) +{ return v_int64x4(v256_blend(v_uint64x4(a.val), v_uint64x4(b.val)).val); } + +// shuffle +// todo: emluate 64bit +#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \ + template \ + inline _Tpvec v256_shuffle(const _Tpvec& a) \ + { return _Tpvec(_mm256_##intrin(a.val, m)); } + +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd) + +template +inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1) +{ + ab0 = v256_unpacklo(a, b); + ab1 = v256_unpackhi(a, b); +} + +template +inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0b11110000)); } + +inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b) +{ return v256_blend<0b11110000>(a, b); } + +inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b) +{ return v256_blend<0b1100>(a, b); } + +template +inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b) +{ return v256_permute2x128<0x21>(a, b); } + +template +inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); } +inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); } +// todo: emulate float32 + +template +inline _Tpvec v256_swap_halves(const _Tpvec& a) +{ return v256_permute2x128<1>(a, a); } + +template +inline _Tpvec v256_reverse_64(const _Tpvec& a) +{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); } + +// ZIP +#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x20>(a, b); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x31>(a, b); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + _Tpvec a1b0 = v256_alignr_128(a, b); \ + c = v256_combine_diagonal(a, a1b0); \ + d = v256_combine_diagonal(a1b0, b); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + _Tpvec ab0ab2, ab1ab3; \ + v256_zip(a, b, ab0ab2, ab1ab3); \ + v_recombine(ab0ab2, ab1ab3, ab0, ab1); \ + } + +OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32) +OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16) +OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4) +OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4) +OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/* Element-wise binary and unary operations 
*/ + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64) + +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) + +inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, + v_int32x8& c, v_int32x8& d) +{ + v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); + + v_int16x16 v0, v1; + v_zip(a * b, vhi, v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, + v_uint32x8& c, v_uint32x8& d) +{ + v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); + + v_uint16x16 v0, v1; + v_zip(a * b, vhi, v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, + v_uint64x4& c, v_uint64x4& d) +{ + __m256i v0 = _mm256_mul_epu32(a.val, b.val); + __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32)); + v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d); +} + + +/** Non-saturating arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8) 
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16) + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, imm)); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(srai(a.val, imm)); } + +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16) +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32) + +inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm) +{ + __m256i d = _mm256_set1_epi64x((int64)1 << 63); + __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm); + return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm)); +} +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1))) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1))) + +/** Select **/ +#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); } + +OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd) + +/** Comparison **/ +#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } \ + inline _Tpvec operator < (const 
_Tpvec& a, const _Tpvec& b) \ + { return b > a; } \ + inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a < b); } \ + inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ + { return b >= a; } + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \ + inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ + inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + { \ + __m256i smask = _mm256_set1_##suffix(sbit); \ + return _Tpuvec(_mm256_cmpgt_##suffix( \ + _mm256_xor_si256(a.val, smask), \ + _mm256_xor_si256(b.val, smask))); \ + } \ + inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ + inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \ + OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128) +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768) +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000) + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \ + inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } + +OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4) +OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4) + +#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); } + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix) + +OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) + +/** min/max **/ +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd) + +/** Rotate **/ +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i swap = 
_mm256_permute2x128_si256(a.val, b.val, 0x03); + + switch(imm) + { + case 0: return a; + case 32: return b; + case 16: return v_uint8x32(swap); + } + + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm)); + if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm)); + + return v_uint8x32(); +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21); + + switch(imm) + { + case 0: return a; + case 32: return b; + case 16: return v_uint8x32(swap); + } + + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm)); + if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16)); + + return v_uint8x32(); +} + +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a) +{ + v_uint8x32 res; + // ESAC control[3] ? [127:0] = 0 + __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0)); + + if (imm == 0) + return a; + if (imm == 16) + res.val = swapz; + else if (imm < 16) + res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm); + else if (imm < 32) + res.val = _mm256_slli_si256(swapz, imm - 16); + else + return v_uint8x32(); + return res; +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a) +{ + v_uint8x32 res; + // ESAC control[3] ? [127:0] = 0 + __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1)); + + if (imm == 0) + return a; + if (imm == 16) + res.val = swapz; + else if (imm < 16) + res.val = _mm256_alignr_epi8(swapz, a.val, imm); + else if (imm < 32) + res.val = _mm256_srli_si256(swapz, imm - 16); + else + return v_uint8x32(); + return res; +} + +#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \ + template \ + inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + const int w = sizeof(typename _Tpvec::lane_type); \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ + v_reinterpret_as_u8(b)); \ + return _Tpvec(cast(ret.val)); \ + } \ + template \ + inline _Tpvec intrin(const _Tpvec& a) \ + { \ + const int w = sizeof(typename _Tpvec::lane_type); \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ + return _Tpvec(cast(ret.val)); \ + } + +#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \ + OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP) + +OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4) + +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd) + +////////// Reduce and mask ///////// + +/** Reduce **/ +#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_srli_si128(v0, 8)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 4)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 2)); \ + return (sctype) _mm_cvtsi128_si32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, 
ushort, min, _mm_min_epu16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16) + +#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_srli_si128(v0, 8)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 4)); \ + return (sctype) _mm_cvtsi128_si32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32) + +#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x8& a) \ + { \ + __m128 v0 = _v256_extract_low(a.val); \ + __m128 v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \ + v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \ + return _mm_cvtss_f32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps) +OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps) + +inline ushort v_reduce_sum(const v_uint16x16& a) +{ + __m128i a0 = _v256_extract_low(a.val); + __m128i a1 = _v256_extract_high(a.val); + + __m128i s0 = _mm_adds_epu16(a0, a1); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2)); + + return (ushort)_mm_cvtsi128_si32(s0); +} + +inline short v_reduce_sum(const v_int16x16& a) +{ + __m256i s0 = _mm256_hadds_epi16(a.val, a.val); + s0 = _mm256_hadds_epi16(s0, s0); + s0 = _mm256_hadds_epi16(s0, s0); + + __m128i s1 = _v256_extract_high(s0); + s1 = _mm_adds_epi16(_v256_extract_low(s0), s1); + + return (short)_mm_cvtsi128_si32(s1); +} + +inline int v_reduce_sum(const v_int32x8& a) +{ + __m256i s0 = _mm256_hadd_epi32(a.val, a.val); + s0 = _mm256_hadd_epi32(s0, s0); + + __m128i s1 = _v256_extract_high(s0); + s1 = _mm_add_epi32(_v256_extract_low(s0), s1); + + return _mm_cvtsi128_si32(s1); +} + +inline unsigned v_reduce_sum(const v_uint32x8& a) +{ return v_reduce_sum(v_reinterpret_as_s32(a)); } + +inline float v_reduce_sum(const v_float32x8& a) +{ + __m256 s0 = _mm256_hadd_ps(a.val, a.val); + s0 = _mm256_hadd_ps(s0, s0); + + __m128 s1 = _v256_extract_high(s0); + s1 = _mm_add_ps(_v256_extract_low(s0), s1); + + return _mm_cvtss_f32(s1); +} + +inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, + const v_float32x8& c, const v_float32x8& d) +{ + __m256 ab = _mm256_hadd_ps(a.val, b.val); + __m256 cd = _mm256_hadd_ps(c.val, d.val); + return v_float32x8(_mm256_hadd_ps(ab, cd)); +} + +/** Popcount **/ +#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \ + inline v_uint32x8 v_popcount(const _Tpvec& a) \ + { \ + const v_uint32x8 m1 = v256_setall_u32(0x55555555); \ + const v_uint32x8 m2 = v256_setall_u32(0x33333333); \ + const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \ + v_uint32x8 p = v_reinterpret_as_u32(a); \ + p = ((p >> 1) & m1) + (p & m1); \ + p = ((p >> 2) & m2) + (p & m2); \ + p = ((p >> 4) & m4) + (p & m4); \ + p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \ + return p; \ + } + +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32) 
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8) + +/** Mask **/ +inline int v_signmask(const v_int8x32& a) +{ return _mm256_movemask_epi8(a.val); } +inline int v_signmask(const v_uint8x32& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } + +inline int v_signmask(const v_int16x16& a) +{ + v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val)); + return v_signmask(v) & 255; +} +inline int v_signmask(const v_uint16x16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_int32x8& a) +{ + __m256i a16 = _mm256_packs_epi32(a.val, a.val); + v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16)); + return v_signmask(v) & 15; +} +inline int v_signmask(const v_uint32x8& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } + +inline int v_signmask(const v_float32x8& a) +{ return _mm256_movemask_ps(a.val); } +inline int v_signmask(const v_float64x4& a) +{ return _mm256_movemask_pd(a.val); } + +/** Checks **/ +#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \ + inline bool v_check_all(const _Tpvec& a) \ + { \ + int mask = v_signmask(v_reinterpret_as_s8(a)); \ + return and_op(mask, allmask) == allmask; \ + } \ + inline bool v_check_any(const _Tpvec& a) \ + { \ + int mask = v_signmask(v_reinterpret_as_s8(a)); \ + return and_op(mask, allmask) != 0; \ + } + +OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaa) +OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaa) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x8888) +OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x8888) + +#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) \ + { \ + int mask = v_signmask(a); \ + return mask == allmask; \ + } \ + inline bool v_check_any(const _Tpvec& a) \ + { \ + int mask = v_signmask(a); \ + return mask != 0; \ + } + +OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255) +OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15) + + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b*b)); } + +OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) + +inline v_float32x8 v_invsqrt(const v_float32x8& x) +{ + v_float32x8 half = x * v256_setall_f32(0.5); + v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val)); + // todo: _mm256_fnmsub_ps + t *= v256_setall_f32(1.5) - ((t * t) * half); + return t; +} + +inline v_float64x4 v_invsqrt(const v_float64x4& x) +{ + return v256_setall_f64(1.) 
/ v_sqrt(x); +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); } + +OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8) +OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16) +OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32) + +inline v_float32x8 v_abs(const v_float32x8& x) +{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); } +inline v_float64x4 v_abs(const v_float64x4& x) +{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); } + +/** Absolute difference **/ +inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = v_sub_wrap(a, b); + v_int8x32 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 d = a - b; + v_int32x8 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) +{ return v_abs(a - b); } + +inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) +{ return v_abs(a - b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x8 v_round(const v_float32x8& a) +{ return v_int32x8(_mm256_cvtps_epi32(a.val)); } + +inline v_int32x8 v_round(const v_float64x4& a) +{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); } + +inline v_int32x8 v_trunc(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(a.val)); } + +inline v_int32x8 v_trunc(const v_float64x4& a) +{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); } + +inline v_int32x8 v_floor(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); } + +inline v_int32x8 v_floor(const v_float64x4& a) +{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); } + +inline v_int32x8 v_ceil(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); } + +inline v_int32x8 v_ceil(const v_float64x4& a) +{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); } + +/** To float **/ +inline v_float32x8 v_cvt_f32(const v_int32x8& a) +{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a) +{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) +{ + __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val); + return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1)); +} + +inline v_float64x4 v_cvt_f64(const v_int32x8& a) +{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); } + +inline v_float64x4 v_cvt_f64_high(const v_int32x8& a) +{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); } + +inline v_float64x4 v_cvt_f64(const v_float32x8& a) +{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); } + +inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) +{ return 
v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); } + +#if CV_FP16 +inline v_float32x8 v_cvt_f32(const v_float16x16& a) +{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_low(a.val))); } + +inline v_float32x8 v_cvt_f32_high(const v_float16x16& a) +{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_high(a.val))); } + +inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b) +{ + __m128i ah = _mm256_cvtps_ph(a.val, 0), bh = _mm256_cvtps_ph(b.val, 0); + return v_float16x16(_mm256_inserti128_si256(_mm256_castsi128_si256(ah), bh, 1)); +} +#endif + +////////////// Lookup table access //////////////////// + +inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} + +inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} + +inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + __m128 z = _mm_setzero_ps(); + __m128 xy01, xy45, xy23, xy67; + xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0])); + xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1])); + xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4])); + xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5])); + __m256 xy0145 = _v256_combine(xy01, xy45); + xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2])); + xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3])); + xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6])); + xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7])); + __m256 xy2367 = _v256_combine(xy23, xy67); + + __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367); + __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367); + + x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367)); + y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_low(idx, idxvec); + __m128d xy0 = _mm_loadu_pd(tab + idx[0]); + __m128d xy2 = _mm_loadu_pd(tab + idx[2]); + __m128d xy1 = _mm_loadu_pd(tab + idx[1]); + __m128d xy3 = _mm_loadu_pd(tab + idx[3]); + __m256d xy02 = _v256_combine(xy0, xy2); + __m256d xy13 = _v256_combine(xy1, xy3); + + x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13)); + y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13)); +} + +////////// Matrix operations ///////// + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) +{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b) + c; } + +#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \ + v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) + +inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const 
v_float32x8& m3) +{ + v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); + v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); + v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& a) +{ + v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); + v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + +#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \ + __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \ + __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \ + __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \ + b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \ + } + +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps) + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(_v256_extract_low(a.val)); \ + b1.val = intrin(_v256_extract_high(a.val)); \ + } \ + inline _Tpwvec v256_load_expand(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16) +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64) + +#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v256_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ + return _Tpvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32) + +/* pack */ +// 16 +inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) +{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); } + +inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) +{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); } + +inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) +{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); } + +inline void v_pack_store(schar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline 
void v_pack_store(uchar* ptr, const v_uint16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + +template inline +v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); + return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), + v_reinterpret_as_s16((b + delta) >> n)); +} + +template inline +void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) +{ + v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); + v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); +} + +template inline +v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + v_pack_u_store(ptr, (a + delta) >> n); +} + +template inline +v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(schar* ptr, const v_int16x16& a) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + v_pack_store(ptr, (a + delta) >> n); +} + +// 32 +inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b) +{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); } + +inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b) +{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } + +inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b) +{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); } + +inline void v_pack_store(short* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(ushort* ptr, const v_uint32x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + + +template inline +v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) +{ + // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers. 
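    // In other words, each output lane is saturate_cast<ushort>((x + (1 << (n-1))) >> n):
    // a right shift by n that rounds to nearest, followed by the saturating pack below.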
+ v_uint32x8 delta = v256_setall_u32(1 << (n-1)); + return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), + v_reinterpret_as_s32((b + delta) >> n)); +} + +template inline +void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) +{ + v_uint32x8 delta = v256_setall_u32(1 << (n-1)); + v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); +} + +template inline +v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + v_pack_u_store(ptr, (a + delta) >> n); +} + +template inline +v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(short* ptr, const v_int32x8& a) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +// 64 +// Non-saturating pack +inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b) +{ + __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3 + return v_uint32x8(_v256_shuffle_odd_64(ab)); +} + +inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b) +{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } + +inline void v_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0)); + v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0))); +} + +inline void v_pack_store(int* ptr, const v_int64x4& b) +{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); } + +template inline +v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) +{ + v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +template inline +v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) +{ + v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(int* ptr, const v_int64x4& a) +{ + v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +/* Recombine */ +// its up there with load and store operations + +/* Extract */ +#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \ + template \ + inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ + { return v_rotate_right(a, b); } + +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4) + + +/** Reinterpret **/ +// its up there with load and store operations + +/* de&interleave */ +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& 
a, _Tpvec& b) \ + { return v256_load_deinterleave_##suffix(ptr, a, b); } \ + inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \ + { return v256_store_interleave_2ch(ptr, a, b); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave \ + (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \ + { return v256_load_deinterleave_##suffix(ptr, a, b, c); } \ + inline void v_store_interleave \ + (_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \ + { return v256_store_interleave_##suffix(ptr, a, b, c); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave \ + (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \ + { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \ + inline void v_store_interleave \ + (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \ + { return v256_store_interleave_##suffix(ptr, a, b, c, d); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) + +/* **** */ +// +template +inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) +{ + _Tpvec ab0, ab1; + v_zip(a, b, ab0, ab1); + v_store(ptr, ab0); + v_store(ptr + _Tpvec::nlanes, ab1); +} + +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + _Tpvec ab0 = v256_load(ptr); + _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec ab00, ab11; + v_recombine(ab0, ab1, ab00, ab11); + v256_zip(ab00, ab11, a, b); +} + +/// +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +{ + _Tpvec abc0 = v256_load(ptr); + _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2); + + _Tpvec ab0 = v256_combine_diagonal(abc0, abc1); + _Tpvec bc1 = v256_combine_diagonal(abc1, abc2); + _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0)); + + a = v256_unpacklo(ab0, ac1); + c = v256_unpackhi(ac1, bc1); + b = v256_alignr_64(bc1, ab0); +} + + +template +inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + _Tpvec ab0 = v256_unpacklo(a, b); + _Tpvec bc1 = v256_unpackhi(b, c); + _Tpvec ca10 = v256_swap_halves(v256_blend<0b1010>(c, a)); + + v_store(ptr, v256_combine_diagonal(ab0, ca10)); + v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0)); + v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1)); +} + +//// +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec abcd0 = v256_load(ptr); + _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2); + _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3); + + _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2); + _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3); + + _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0); + _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1); + _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2); + _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3); + + v256_zip(ab0, ab1, a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l4(_Tp* 
ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_zip(a, b, ab0, ab1); + v256_zip(c, d, cd0, cd1); + + _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0); + _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1); + + v_store(ptr, v256_combine_diagonal(ab0, ab0cd0)); + v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1)); + v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0)); + v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1)); +} + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4) + +/* **** **** */ +// +inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b) +{ + v_float32x8 ab0 = v256_load(ptr); + v_float32x8 ab1 = v256_load(ptr + 8); + + v_float32x8 ab0ab2, ab1ab3; + v_recombine(ab0, ab1, ab0ab2, ab1ab3); + + a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0)); + b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1)); +} + +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + v_float32x8 fa, fb; + v256_load_deinterleave_l8((float*)ptr, fa, fb); + a.val = v_reinterpret_as_u32(fa).val; + b.val = v_reinterpret_as_u32(fb).val; +} +/// +template +inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + _Tpvec ab0, ab1, bc0, bc1; + v256_zip(a, b, ab0, ab1); + v256_zip(b, c, bc0, bc1); + + _Tpvec cazg = v256_blend<0b10101010>(c, a); + _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val)); + _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val)); + _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0b11001100>(ab1, bc0)); + + _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); + _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); + _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + + v_store(ptr, abc0); + v_store(ptr + _Tpvec::nlanes, abc1); + v_store(ptr + _Tpvec::nlanes * 2, abc2); +} + +inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c) +{ + v_float32x8 ab0, ab1, bc0, bc1; + v256_zip(a, b, ab0, ab1); + v256_zip(b, c, bc0, bc1); + + v_float32x8 cazg = v256_blend<0b10101010>(c, a); + v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0))); + v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2))); + + v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2))); + v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2); + + v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); + v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); + v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + + v_store(ptr, abc0); + v_store(ptr + 8, abc1); + v_store(ptr + 16, abc2); +} + +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +{ + _Tpvec abc02 = v256_load(ptr); + _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2); + + _Tpvec abc2 = v256_alignr_128(abc02, abc20); + _Tpvec abc0 = v256_combine_diagonal(abc02, abc20); + + a = v256_blend<0b10010010>(abc0, abc1); + a = v256_blend<0b01000100>(a, abc2); + + b = v256_blend<0b00100100>(abc0, abc1); + b = v256_blend<0b10011001>(b, abc2); + + c = v256_blend<0b01001001>(abc0, abc1); + c = v256_blend<0b00100010>(c, abc2); + 
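    // The blends above gather each channel's elements from the three loads; within each
    // 128-bit lane they come out rotated, and the per-lane shuffles below restore the
    // ascending element order.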
+ a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a); + b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b); + c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c); +} +///// +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1); + v256_zip(ab0, ab1, a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ + _Tpvec ac0, ac1, bd0, bd1; + v256_zip(a, c, ac0, ac1); + v256_zip(b, d, bd0, bd1); + + _Tpvec abcd0, abcd1, abcd2, abcd3; + v256_zip(ac0, bd0, abcd0, abcd1); + v256_zip(ac1, bd1, abcd2, abcd3); + + _Tpvec abcd01, abcd23, abcd45, abcd67; + v_recombine(abcd0, abcd1, abcd01, abcd45); + v_recombine(abcd2, abcd3, abcd23, abcd67); + + v_store(ptr, abcd01); + v_store(ptr + _Tpvec::nlanes, abcd23); + v_store(ptr + _Tpvec::nlanes * 2, abcd45); + v_store(ptr + _Tpvec::nlanes * 3, abcd67); +} + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8) + +/* ******** ******** */ +// +template +inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + ); + + _Tpvec ab0, ab1; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + + __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); + __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + + a.val = _mm256_unpacklo_epi64(a0b0, a1b1); + b.val = _mm256_unpackhi_epi64(a0b0, a1b1); +} +/// +template +inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b)); + v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b)); + v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c)); + v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c)); + + v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0b10101010>(c, a)); + cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg); + + v_uint32x8 ac1ab1 = v256_blend<0b10101010>(ab1, bc1); + ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1); + + v_uint32x8 abc001 = v256_blend<0b10101010>(ab0, cazg); + v_uint32x8 cabc0 = v256_blend<0b10101010>(cazg, bc0); + + v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1); + v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001); + + v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0)); + v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0)); + abc21 = v256_swap_halves(abc21); + v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1)); + + v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21); + v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01); + v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12); + + v_store(ptr, _Tpvec(abc0.val)); + v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val)); + v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val)); +} +// todo: +template +inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) +{} +//// +template +inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1); + v256_zip(ab0, ab1, 
a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ v256_store_interleave_l8(ptr, a, b, c, d); } + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16) + +/* **************** **************** */ +// +template +inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + + _Tpvec ab0, ab1; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + + __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); + __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + + a.val = _mm256_unpacklo_epi64(a0b0, a1b1); + b.val = _mm256_unpackhi_epi64(a0b0, a1b1); +} + +/// todo +template +inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&) +{} +template +inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) +{} +//// +template +inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + ); + + _Tpvec abcd0, abcd1, abcd2, abcd3; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1); + v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3); + + __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep); + __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep); + __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep); + __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep); + + __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1); + __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3); + __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1); + __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3); + + a.val = _mm256_unpacklo_epi64(ab0, ab1); + b.val = _mm256_unpackhi_epi64(ab0, ab1); + c.val = _mm256_unpacklo_epi64(cd0, cd1); + d.val = _mm256_unpackhi_epi64(cd0, cd1); +} + +template +inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ v256_store_interleave_l8(ptr, a, b, c, d); } + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32) + +inline void v256_cleanup() { _mm256_zeroupper(); } + +//! @name Check SIMD256 support +//! @{ +//! @brief Check CPU capability of SIMD operation +static inline bool hasSIMD256() +{ + return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false; +} +//! @} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_AVX_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index c7cbb578db..1f5f53100a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -247,8 +247,6 @@ template struct v_reg { //! 
@cond IGNORED typedef _Tp lane_type; - typedef v_reg::int_type, n> int_vec; - typedef v_reg::abs_type, n> abs_vec; enum { nlanes = n }; // !@endcond @@ -797,11 +795,11 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Multiply and add -Returns \f$ a*b + c \f$ -For floating point types and signed 32bit int only. */ + Returns \f$ a*b + c \f$ + For floating point types and signed 32bit int only. */ template -inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) +inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) { v_reg<_Tp, n> d; for( int i = 0; i < n; i++ ) @@ -809,6 +807,14 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, return d; } +/** @brief A synonym for v_fma */ +template +inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) +{ + return v_fma(a, b, c); +} + /** @brief Dot product of elements Multiply values in two registers and sum adjacent result pairs. @@ -1141,9 +1147,9 @@ template inline void v_zip( const v_reg<_Tp, n>& a0, const @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc. */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) { - return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); } /** @brief Load register contents from memory (aligned) @@ -1151,9 +1157,9 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary) */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) { - return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); } /** @brief Load 64-bits of data to lower part (high part is undefined). @@ -1166,9 +1172,9 @@ v_int32x4 r = v_load_low(lo); @endcode */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) { - v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = ptr[i]; @@ -1187,9 +1193,9 @@ v_int32x4 r = v_load_halves(lo, hi); @endcode */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr) { - v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = loptr[i]; @@ -1208,11 +1214,11 @@ v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32 @endcode For 8-, 16-, 32-bit integer source types. 
*/ template -inline v_reg::w_type, V_SIMD128Traits<_Tp>::nlanes / 2> +inline v_reg::w_type, V_TypeTraits<_Tp>::nlanes128 / 2> v_load_expand(const _Tp* ptr) { typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg::nlanes> c; + v_reg::nlanes128> c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1229,11 +1235,11 @@ v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32 @endcode For 8-bit integer source types. */ template -inline v_reg::q_type, V_SIMD128Traits<_Tp>::nlanes / 4> +inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> v_load_expand_q(const _Tp* ptr) { typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg::nlanes> c; + v_reg::nlanes128> c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1622,6 +1628,17 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = (float)a.s[i]; + c.s[i+n] = (float)b.s[i]; + } + return c; +} + /** @brief Convert to double Supported input type is cv::v_int32x4. */ @@ -1644,6 +1661,52 @@ template inline v_reg v_cvt_f64(const v_reg& a) return c; } +template inline v_reg v_lut(const int* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const float* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const double* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline void v_lut_deinterleave(const float* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + +template inline void v_lut_deinterleave(const double* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + /** @brief Transpose 4x4 matrix Scheme: @@ -1968,6 +2031,8 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); } +inline void v_cleanup() {} + //! @} //! 
@name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 9dadab57ea..fdb3ec09cb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -280,11 +280,29 @@ struct v_float64x2 #if CV_FP16 // Workaround for old compilers -template static inline int16x4_t vreinterpret_s16_f16(T a) -{ return (int16x4_t)a; } -template static inline float16x4_t vreinterpret_f16_s16(T a) -{ return (float16x4_t)a; } -template static inline float16x4_t cv_vld1_f16(const T* ptr) +static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; } +static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; } +static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; } +static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; } + +static inline float16x8_t cv_vld1q_f16(const void* ptr) +{ +#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro + return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr)); +#else + return vld1q_f16((const __fp16*)ptr); +#endif +} +static inline void cv_vst1q_f16(void* ptr, float16x8_t a) +{ +#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro + vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a)); +#else + vst1q_f16((__fp16*)ptr, a); +#endif +} + +static inline float16x4_t cv_vld1_f16(const void* ptr) { #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); @@ -292,7 +310,7 @@ template static inline float16x4_t cv_vld1_f16(const T* ptr) return vld1_f16((const __fp16*)ptr); #endif } -template static inline void cv_vst1_f16(T* ptr, float16x4_t a) +static inline void cv_vst1_f16(void* ptr, float16x4_t a) { #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); @@ -301,24 +319,28 @@ template static inline void cv_vst1_f16(T* ptr, float16x4_t a) #endif } -struct v_float16x4 + +struct v_float16x8 { typedef short lane_type; - enum { nlanes = 4 }; + enum { nlanes = 8 }; - v_float16x4() {} - explicit v_float16x4(float16x4_t v) : val(v) {} - v_float16x4(short v0, short v1, short v2, short v3) + v_float16x8() {} + explicit v_float16x8(float16x8_t v) : val(v) {} + v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { - short v[] = {v0, v1, v2, v3}; - val = cv_vld1_f16(v); + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = cv_vld1q_f16(v); } short get0() const { - return vget_lane_s16(vreinterpret_s16_f16(val), 0); + return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0); } - float16x4_t val; + float16x8_t val; }; + +inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); } +inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); } #endif #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ @@ -731,16 +753,32 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); } -inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { +#if CV_SIMD128_64F + // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined), + // also adds FMA 
support both for single- and double-precision floating-point vectors + return v_float32x4(vfmaq_f32(c.val, a.val, b.val)); +#else return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); +#endif } -inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return v_int32x4(vmlaq_s32(c.val, a.val, b.val)); } +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + #if CV_SIMD128_64F inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) { @@ -753,9 +791,14 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val))); } +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_float64x2(vfmaq_f64(c.val, a.val, b.val)); +} + inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val))); + return v_fma(a, b, c); } #endif @@ -841,10 +884,15 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) #if CV_FP16 // Workaround for old comiplers -inline v_float16x4 v_load_f16(const short* ptr) -{ return v_float16x4(cv_vld1_f16(ptr)); } -inline void v_store_f16(short* ptr, v_float16x4& a) -{ cv_vst1_f16(ptr, a.val); } +inline v_float16x8 v_load_f16(const short* ptr) +{ return v_float16x8(cv_vld1q_f16(ptr)); } +inline v_float16x8 v_load_f16_aligned(const short* ptr) +{ return v_float16x8(cv_vld1q_f16(ptr)); } + +inline void v_store(short* ptr, const v_float16x8& a) +{ cv_vst1q_f16(ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x8& a) +{ cv_vst1q_f16(ptr, a.val); } #endif #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ @@ -1293,6 +1341,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a) return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val))); +} + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val)))); @@ -1315,17 +1368,88 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) #endif #if CV_FP16 -inline v_float32x4 v_cvt_f32(const v_float16x4& a) +inline v_float32x4 v_cvt_f32(const v_float16x8& a) +{ + return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val))); +} +inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) { - return v_float32x4(vcvt_f32_f16(a.val)); + return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val))); } -inline v_float16x4 v_cvt_f16(const v_float32x4& a) +inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) { - return v_float16x4(vcvt_f16_f32(a.val)); + return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val))); } #endif +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + tab[vgetq_lane_s32(idxvec.val, 2)], + tab[vgetq_lane_s32(idxvec.val, 3)] + }; + return v_int32x4(vld1q_s32(elems)); +} + +inline 
v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + tab[vgetq_lane_s32(idxvec.val, 2)], + tab[vgetq_lane_s32(idxvec.val, 3)] + }; + return v_float32x4(vld1q_f32(elems)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + /*int CV_DECL_ALIGNED(32) idx[4]; + v_store(idx, idxvec); + + float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2])); + float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3])); + + float32x4x2_t xxyy = vuzpq_f32(xy02, xy13); + x = v_float32x4(xxyy.val[0]); + y = v_float32x4(xxyy.val[1]);*/ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + }; + return v_float64x2(vld1q_f64(elems)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} +#endif + +inline void v_cleanup() {} + //! @name Check SIMD support //! @{ //! @brief Check CPU capability of SIMD operation diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 8c61f44f4a..b79ea16a4d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -58,6 +58,17 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +struct v_uint8x16; +struct v_int8x16; +struct v_uint16x8; +struct v_int16x8; +struct v_uint32x4; +struct v_int32x4; +struct v_float32x4; +struct v_uint64x2; +struct v_int64x2; +struct v_float64x2; + struct v_uint8x16 { typedef uchar lane_type; @@ -144,6 +155,7 @@ struct v_int16x8 { return (short)_mm_cvtsi128_si32(val); } + __m128i val; }; @@ -163,6 +175,7 @@ struct v_uint32x4 { return (unsigned)_mm_cvtsi128_si32(val); } + __m128i val; }; @@ -182,6 +195,7 @@ struct v_int32x4 { return _mm_cvtsi128_si32(val); } + __m128i val; }; @@ -201,6 +215,7 @@ struct v_float32x4 { return _mm_cvtss_f32(val); } + __m128 val; }; @@ -222,6 +237,7 @@ struct v_uint64x2 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (unsigned)a | ((uint64)(unsigned)b << 32); } + __m128i val; }; @@ -243,6 +259,7 @@ struct v_int64x2 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); } + __m128i val; }; @@ -262,29 +279,31 @@ struct v_float64x2 { return _mm_cvtsd_f64(val); } + __m128d val; }; -#if CV_FP16 -struct v_float16x4 +struct v_float16x8 { typedef short lane_type; typedef __m128i vector_type; - enum { nlanes = 4 }; + enum { nlanes = 8 }; - v_float16x4() : val(_mm_setzero_si128()) {} - explicit v_float16x4(__m128i v) : val(v) {} - v_float16x4(short v0, short v1, short v2, short v3) + v_float16x8() : val(_mm_setzero_si128()) {} + explicit v_float16x8(__m128i v) : val(v) {} + v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { - val = _mm_setr_epi16(v0, v1, 
v2, v3, 0, 0, 0, 0); + val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); } short get0() const { return (short)_mm_cvtsi128_si32(val); } + __m128i val; }; -#endif +inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); } +inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); } namespace hal_sse_internal { @@ -697,11 +716,15 @@ inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) } inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) { +#if CV_SSE4_1 + return v_int32x4(_mm_mullo_epi32(a.val, b.val)); +#else __m128i c0 = _mm_mul_epu32(a.val, b.val); __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); __m128i d0 = _mm_unpacklo_epi32(c0, c1); __m128i d1 = _mm_unpackhi_epi32(c0, c1); return v_int32x4(_mm_unpacklo_epi64(d0, d1)); +#endif } inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) { @@ -1027,11 +1050,35 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) __m128i m = _mm_cmpgt_epi32(b.val, a.val); return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); } -inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return a * b + c; } +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ +#if CV_FMA3 + return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val)); +#else + return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val)); +#endif +} + +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ +#if CV_FMA3 + return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val)); +#else + return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val)); +#endif +} + #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1040,17 +1087,16 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(_mm_sqrt_##suffix(res)); \ + _Tpvec res = v_fma(a, a, b*b); \ + return _Tpvec(_mm_sqrt_##suffix(res.val)); \ } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(res); \ + return v_fma(a, a, b*b); \ } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { \ - return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ + return v_fma(a, b, c); \ } OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) @@ -1268,12 +1314,15 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) -#if CV_FP16 -inline v_float16x4 v_load_f16(const short* ptr) -{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); } -inline void v_store_f16(short* ptr, v_float16x4& a) -{ _mm_storel_epi64((__m128i*)ptr, a.val); } -#endif +inline v_float16x8 v_load_f16(const short* ptr) +{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); } +inline v_float16x8 
v_load_f16_aligned(const short* ptr) +{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); } + +inline void v_store(short* ptr, const v_float16x8& a) +{ _mm_storeu_si128((__m128i*)ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x8& a) +{ _mm_store_si128((__m128i*)ptr, a.val); } #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \ inline scalartype v_reduce_##func(const v_##_Tpvec& a) \ @@ -2183,6 +2232,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a) return v_float32x4(_mm_cvtpd_ps(a.val)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val))); +} + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(_mm_cvtepi32_pd(a.val)); @@ -2200,21 +2254,82 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { - return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8)))); + return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val))); } #if CV_FP16 -inline v_float32x4 v_cvt_f32(const v_float16x4& a) +inline v_float32x4 v_cvt_f32(const v_float16x8& a) { return v_float32x4(_mm_cvtph_ps(a.val)); } -inline v_float16x4 v_cvt_f16(const v_float32x4& a) +inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) +{ + return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val))); +} + +inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) { - return v_float16x4(_mm_cvtps_ph(a.val, 0)); + return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0))); } #endif +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int idx[2]; + v_store_low(idx, idxvec); + return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]])); +} + +// loads pairs from the table and deinterleaves them, e.g. returns: +// x = (tab[idxvec[0], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]), +// y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1]) +// note that the indices are float's indices, not the float-pair indices. +// in theory, this function can be used to implement bilinear interpolation, +// when idxvec are the offsets within the image. 
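// A scalar reference for the contract described above (sketch only; idx, x and y stand for the
// stored index vector and the two output vectors):
//
//   for (int i = 0; i < 4; i++)
//   {
//       x[i] = tab[idx[i]];      // even element of each pair
//       y[i] = tab[idx[i] + 1];  // odd element of each pair
//   }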
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + __m128 z = _mm_setzero_ps(); + __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0])); + __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2])); + xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1])); + xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3])); + __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23); + __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23); + x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13)); + y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int idx[2]; + v_store_low(idx, idxvec); + __m128d xy0 = _mm_loadu_pd(tab + idx[0]); + __m128d xy1 = _mm_loadu_pd(tab + idx[1]); + x = v_float64x2(_mm_unpacklo_pd(xy0, xy1)); + y = v_float64x2(_mm_unpackhi_pd(xy0, xy1)); +} + +inline void v_cleanup() {} + //! @name Check SIMD support //! @{ //! @brief Check CPU capability of SIMD operation diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 8b76dd8487..069e9578eb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -764,6 +764,8 @@ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \ +inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ +{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { return _Tpvec(vec_madd(a.val, b.val, c.val)); } @@ -836,6 +838,9 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) inline v_float32x4 v_cvt_f32(const v_float64x2& a) { return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); } + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); } @@ -848,6 +853,48 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); } +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + 
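// A usage sketch (illustrative only, not part of the patch): the bilinear interpolation hinted
// at in the intrin_sse.hpp comment above can be built from v_lut_deinterleave and v_muladd
// roughly as follows. Here 'img' is a single-channel float image, 'step' its row stride in
// floats, 'ofs' holds the offsets of each top-left sample, and wx/wy are the fractional
// weights; all of these names are hypothetical.
static inline v_float32x4 bilinear_sketch(const float* img, int step, const v_int32x4& ofs,
                                          const v_float32x4& wx, const v_float32x4& wy)
{
    v_float32x4 p00, p01, p10, p11;
    v_lut_deinterleave(img,        ofs, p00, p01);  // top pixel and its right neighbour
    v_lut_deinterleave(img + step, ofs, p10, p11);  // the same pair one row below
    v_float32x4 top = v_muladd(p01 - p00, wx, p00); // lerp along x, top row
    v_float32x4 bot = v_muladd(p11 - p10, wx, p10); // lerp along x, bottom row
    return v_muladd(bot - top, wy, top);            // lerp along y
}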
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} + +inline void v_cleanup() {} + + /** Reinterpret **/ /** its up there with load and store operations **/ diff --git a/modules/core/src/convert.fp16.cpp b/modules/core/src/convert.fp16.cpp index 6c71093e57..7168e8d643 100644 --- a/modules/core/src/convert.fp16.cpp +++ b/modules/core/src/convert.fp16.cpp @@ -81,10 +81,9 @@ void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) { float32x4_t v_src = vld1q_f32(src + x); - float16x4_t v_dst = vcvt_f16_f32(v_src); - cv_vst1_f16((__fp16*)dst + x, v_dst); + cv_vst1_f16(dst + x, v_dst); } for ( ; x < size.width; x++ ) diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index b10bab6d63..354cc00421 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -22,7 +22,6 @@ void log32f(const float *src, float *dst, int n); void log64f(const double *src, double *dst, int n); float fastAtan2(float y, float x); - #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY using namespace std; @@ -36,162 +35,140 @@ static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); using namespace cv; -#if CV_SIMD128 - -template -struct v_atan +static inline float atan_f32(float y, float x) { - typedef V_RegTrait128 Trait; - typedef typename Trait::reg VT; // vector type - enum { WorkWidth = VT::nlanes * 2 }; - - v_atan(const T & scale) - : s(Trait::all(scale)) + float ax = std::abs(x), ay = std::abs(y); + float a, c, c2; + if( ax >= ay ) { - eps = Trait::all(DBL_EPSILON); - z = Trait::zero(); - p7 = Trait::all(atan2_p7); - p5 = Trait::all(atan2_p5); - p3 = Trait::all(atan2_p3); - p1 = Trait::all(atan2_p1); - val90 = Trait::all(90.f); - val180 = Trait::all(180.f); - val360 = Trait::all(360.f); + c = ay/(ax + (float)DBL_EPSILON); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; } + else + { + c = ax/(ay + (float)DBL_EPSILON); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + if( x < 0 ) + a = 180.f - a; + if( y < 0 ) + a = 360.f - a; + return a; +} - inline int operator()(int len, const T * Y, const T * X, T * angle) +#if CV_SIMD + +struct v_atan_f32 +{ + explicit v_atan_f32(const float& scale) { - int i = 0; - const int c = VT::nlanes; - for ( ; i <= len - c * 2; i += c * 2) - { - VT x1 = v_load(X + i); - VT x2 = v_load(X + i + c); - VT y1 = v_load(Y + i); - VT y2 = v_load(Y + i + c); - v_store(&angle[i], s * one(x1, y1)); - v_store(&angle[i + c], s * one(x2, y2)); - } - return i; + eps = vx_setall_f32((float)DBL_EPSILON); + z = vx_setzero_f32(); + p7 = vx_setall_f32(atan2_p7); + p5 = vx_setall_f32(atan2_p5); + p3 = vx_setall_f32(atan2_p3); + p1 = vx_setall_f32(atan2_p1); + val90 = vx_setall_f32(90.f); + val180 = vx_setall_f32(180.f); + val360 = vx_setall_f32(360.f); + s = vx_setall_f32(scale); } -private: - inline VT one(VT & x, VT & y) + v_float32 compute(const v_float32& y, const v_float32& x) { - VT ax = v_abs(x); - VT ay = v_abs(y); - VT c = v_min(ax, ay) / (v_max(ax, ay) + eps); - VT cc = c * c; - VT a = (((p7 * cc + p5) * cc + p3) * cc + p1) * c; + v_float32 ax = v_abs(x); + v_float32 ay = v_abs(y); + v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + 
eps); + v_float32 cc = c * c; + v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c; a = v_select(ax >= ay, a, val90 - a); a = v_select(x < z, val180 - a, a); a = v_select(y < z, val360 - a, a); - return a; + return a * s; } -private: - VT eps; - VT z; - VT p7; - VT p5; - VT p3; - VT p1; - VT val90; - VT val180; - VT val360; - VT s; + v_float32 eps; + v_float32 z; + v_float32 p7; + v_float32 p5; + v_float32 p3; + v_float32 p1; + v_float32 val90; + v_float32 val180; + v_float32 val360; + v_float32 s; }; -#if !CV_SIMD128_64F +#endif + +} // anonymous:: + +///////////////////////////////////// ATAN2 //////////////////////////////////// -// emulation -template <> -struct v_atan +static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { - v_atan(double scale) : impl(static_cast(scale)) {} - inline int operator()(int len, const double * Y, const double * X, double * angle) + float scale = angleInDegrees ? 1.f : (float)(CV_PI/180); + int i = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + v_atan_f32 v(scale); + + for( ; i < len; i += VECSZ*2 ) { - int i = 0; - const int c = v_atan::WorkWidth; - float bufY[c]; - float bufX[c]; - float bufA[c]; - for ( ; i <= len - c ; i += c) + if( i + VECSZ*2 > len ) { - for (int j = 0; j < c; ++j) - { - bufY[j] = static_cast(Y[i + j]); - bufX[j] = static_cast(X[i + j]); - } - impl(c, bufY, bufX, bufA); - for (int j = 0; j < c; ++j) - { - angle[i + j] = bufA[j]; - } + // if it's inplace operation, we cannot repeatedly process + // the tail for the second time, so we have to use the + // scalar code + if( i == 0 || angle == X || angle == Y ) + break; + i = len - VECSZ*2; } - return i; - } -private: - v_atan impl; -}; -#endif - -#endif -template -static inline T atanImpl(T y, T x) -{ - T ax = std::abs(x), ay = std::abs(y); - T a, c, c2; - if( ax >= ay ) - { - c = ay/(ax + static_cast(DBL_EPSILON)); - c2 = c*c; - a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - else - { - c = ax/(ay + static_cast(DBL_EPSILON)); - c2 = c*c; - a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - if( x < 0 ) - a = 180.f - a; - if( y < 0 ) - a = 360.f - a; - return a; -} + v_float32 y0 = vx_load(Y + i); + v_float32 x0 = vx_load(X + i); + v_float32 y1 = vx_load(Y + i + VECSZ); + v_float32 x1 = vx_load(X + i + VECSZ); -template -static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angleInDegrees) -{ - int i = 0; - T scale = angleInDegrees ? 
1 : static_cast(CV_PI/180); + v_float32 r0 = v.compute(y0, x0); + v_float32 r1 = v.compute(y1, x1); -#if CV_SIMD128 - i = v_atan(scale)(len, Y, X, angle); + v_store(angle + i, r0); + v_store(angle + i + VECSZ, r1); + } + vx_cleanup(); #endif for( ; i < len; i++ ) - { - angle[i] = atanImpl(Y[i], X[i]) * scale; - } + angle[i] = atan_f32(Y[i], X[i])*scale; } -} // anonymous:: - -///////////////////////////////////// ATAN2 //////////////////////////////////// - void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { CV_INSTRUMENT_REGION() - atanImpl(Y, X, angle, len, angleInDegrees); + fastAtan32f_(Y, X, angle, len, angleInDegrees ); } void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) { CV_INSTRUMENT_REGION() - atanImpl(Y, X, angle, len, angleInDegrees); + + const int BLKSZ = 128; + float ybuf[BLKSZ], xbuf[BLKSZ], abuf[BLKSZ]; + for( int i = 0; i < len; i += BLKSZ ) + { + int j, blksz = std::min(BLKSZ, len - i); + for( j = 0; j < blksz; j++ ) + { + ybuf[j] = (float)Y[i + j]; + xbuf[j] = (float)X[i + j]; + } + fastAtan32f_(ybuf, xbuf, abuf, blksz, angleInDegrees); + for( j = 0; j < blksz; j++ ) + angle[i + j] = abuf[j]; + } } // deprecated @@ -207,16 +184,24 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4); - v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || mag == x || mag == y ) + break; + i = len - VECSZ*2; + } + v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); + v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); v_store(mag + i, x0); - v_store(mag + i + 4, x1); + v_store(mag + i + VECSZ, x1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -232,16 +217,24 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) int i = 0; -#if CV_SIMD128_64F - for( ; i <= len - 4; i += 4 ) +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); - v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2); + if( i + VECSZ*2 > len ) + { + if( i == 0 || mag == x || mag == y ) + break; + i = len - VECSZ*2; + } + v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); + v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); v_store(mag + i, x0); - v_store(mag + i + 2, x1); + v_store(mag + i + VECSZ, x1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -258,14 +251,22 @@ void invSqrt32f(const float* src, float* dst, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_invsqrt(t0); t1 = v_invsqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 4, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -276,13 +277,23 @@ void invSqrt32f(const float* src, float* dst, int len) void 
invSqrt64f(const double* src, double* dst, int len) { CV_INSTRUMENT_REGION() - int i = 0; -#if CV_SSE2 - __m128d v_1 = _mm_set1_pd(1.0); - for ( ; i <= len - 2; i += 2) - _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for ( ; i < len; i += VECSZ*2) + { + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); + t0 = v_invsqrt(t0); + t1 = v_invsqrt(t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); + } #endif for( ; i < len; i++ ) @@ -296,14 +307,22 @@ void sqrt32f(const float* src, float* dst, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_sqrt(t0); t1 = v_sqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 4, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -317,14 +336,22 @@ void sqrt64f(const double* src, double* dst, int len) int i = 0; -#if CV_SIMD128_64F - for( ; i <= len - 4; i += 4 ) +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_sqrt(t0); t1 = v_sqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 2, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -377,21 +404,6 @@ void log64f(const double *src, double *dst, int n) ////////////////////////////////////// EXP ///////////////////////////////////// -typedef union -{ - struct { -#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) - int hi; - int lo; -#else - int lo; - int hi; -#endif - } i; - double d; -} -DBLINT; - #define EXPTAB_SCALE 6 #define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) @@ -464,6 +476,8 @@ static const double expTab[] = { 1.9784560263879509682582499181312 * EXPPOLY_32F_A0, }; +static float expTab_f[EXPTAB_MASK+1]; +static volatile bool extTab_f_initialized = false; // the code below uses _mm_cast* intrinsics, which are not avialable on VS2005 #if (defined _MSC_VER && _MSC_VER < 1500) || \ @@ -480,283 +494,117 @@ void exp32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() + if( !extTab_f_initialized ) + { + for( int j = 0; j <= EXPTAB_MASK; j++ ) + expTab_f[j] = (float)expTab[j]; + extTab_f_initialized = true; + } + static const float A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); -#undef EXPPOLY -#define EXPPOLY(x) \ -(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4) - int i = 0; const Cv32suf* x = (const Cv32suf*)_x; - Cv32suf buf[4]; - -#if CV_AVX2 - if( n >= 8 ) + float minval = (float)(-exp_max_val/exp_prescale); + float maxval = (float)(exp_max_val/exp_prescale); + float postscale = (float)exp_postscale; + 
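All of the vectorized loops touched by this patch (the magnitude/invSqrt/sqrt loops above and the exp/log loops below) share the same tail strategy: when fewer than one full batch of elements remains, the loop steps back and reprocesses the last full batch, overlapping the previous one, instead of falling through to scalar code. As the comment in fastAtan32f_ explains, this is only safe for out-of-place calls (otherwise already-written outputs would be read back) and for arrays at least one batch long; in those cases the loop breaks and the scalar tail runs. A toy out-of-place kernel showing just this pattern, with a made-up doubling operation and hypothetical names:

#include "opencv2/core/hal/intrin.hpp"

static void scale2_f32(const float* src, float* dst, int n)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    for( ; i < n; i += VECSZ )
    {
        if( i + VECSZ > n )
        {
            if( i == 0 || src == dst )
                break;         // too short, or in-place: fall back to the scalar tail
            i = n - VECSZ;     // redo the last full batch, overlapping the previous one
        }
        v_float32 t = vx_load(src + i);
        v_store(dst + i, t + t);
    }
    vx_cleanup();
#endif
    for( ; i < n; i++ )
        dst[i] = src[i] + src[i];
}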
+#if CV_SIMD + const int VECSZ = v_float32::nlanes; + static const v_float32 vprescale = vx_setall_f32((float)exp_prescale); + static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); + static const v_float32 vminval = vx_setall_f32(minval); + static const v_float32 vmaxval = vx_setall_f32(maxval); + + static const v_float32 vA1 = vx_setall_f32((float)A1); + static const v_float32 vA2 = vx_setall_f32((float)A2); + static const v_float32 vA3 = vx_setall_f32((float)A3); + static const v_float32 vA4 = vx_setall_f32((float)A4); + + static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + bool y_aligned = (size_t)(void*)y % 32 == 0; + + for( ; i < n; i += VECSZ*2 ) { - static const __m256d prescale4 = _mm256_set1_pd(exp_prescale); - static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m256 mA1 = _mm256_set1_ps(A1); - static const __m256 mA2 = _mm256_set1_ps(A2); - static const __m256 mA3 = _mm256_set1_ps(A3); - static const __m256 mA4 = _mm256_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 32 == 0; - - ushort CV_DECL_ALIGNED(32) tab_idx[16]; - - for( ; i <= n - 8; i += 8 ) + if( i + VECSZ*2 > n ) { - __m128i xi0, xi1; - - __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4)); - __m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4)); - - xd0 = _mm256_mul_pd(xd0, prescale4); - xd1 = _mm256_mul_pd(xd1, prescale4); + if( i == 0 || _x == y ) + break; + i = n - VECSZ*2; + y_aligned = false; + } - xi0 = _mm256_cvtpd_epi32(xd0); - xi1 = _mm256_cvtpd_epi32(xd1); + v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); - xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0)); - xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1)); + xf0 = v_min(v_max(xf0, vminval), vmaxval); + xf1 = v_min(v_max(xf1, vminval), vmaxval); - // gcc does not support _mm256_set_m128 - //xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0)); - __m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1); + xf0 *= vprescale; + xf1 *= vprescale; - xf = _mm256_mul_ps(xf, postscale8); + v_int32 xi0 = v_round(xf0); + v_int32 xi1 = v_round(xf1); + xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale; + xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale; - xi0 = _mm_packs_epi32(xi0, xi1); + v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask); + v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); + v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255); + xi0 = v_min(v_max(v_shr(xi0) + v127, v0), v255); + xi1 = v_min(v_max(v_shr(xi1) + v127, v0), v255); - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); + yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0)); + yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1)); - __m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]); - __m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]); + v_float32 zf0 = xf0 + vA1; + v_float32 zf1 = xf1 + 
vA1; - // gcc does not support _mm256_set_m128 - //__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0)); - __m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1); + zf0 = v_fma(zf0, xf0, vA2); + zf1 = v_fma(zf1, xf1, vA2); - //_mm256_set_m128i(xi1, xi0) - __m256i temp = _mm256_castps_si256(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(xi0)), _mm_castsi128_ps(xi1), 1)); + zf0 = v_fma(zf0, xf0, vA3); + zf1 = v_fma(zf1, xf1, vA3); - yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23))); + zf0 = v_fma(zf0, xf0, vA4); + zf1 = v_fma(zf1, xf1, vA4); - __m256 zf = _mm256_add_ps(xf, mA1); + zf0 *= yf0; + zf1 *= yf1; -#if CV_FMA3 - zf = _mm256_fmadd_ps(zf, xf, mA2); - zf = _mm256_fmadd_ps(zf, xf, mA3); - zf = _mm256_fmadd_ps(zf, xf, mA4); -#else - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2); - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3); - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4); -#endif - zf = _mm256_mul_ps(zf, yf); - - if( y_aligned ) - { - _mm256_store_ps(y + i, zf); - } - else - { - _mm256_storeu_ps(y + i, zf); - } + if( y_aligned ) + { + v_store_aligned(y + i, zf0); + v_store_aligned(y + i + VECSZ, zf1); } - } -#elif CV_SSE2 - if( n >= 8 ) - { - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); - static const __m128 mA3 = _mm_set1_ps(A3); - static const __m128 mA4 = _mm_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 16 == 0; - - ushort CV_DECL_ALIGNED(16) tab_idx[8]; - - for( ; i <= n - 8; i += 8 ) + else { - __m128 xf0, xf1; - xf0 = _mm_loadu_ps(&x[i].f); - xf1 = _mm_loadu_ps(&x[i+4].f); - __m128i xi0, xi1, xi2, xi3; - - xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4); - xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4); - - __m128d xd0 = _mm_cvtps_pd(xf0); - __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0)); - __m128d xd1 = _mm_cvtps_pd(xf1); - __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1)); - - xd0 = _mm_mul_pd(xd0, prescale2); - xd2 = _mm_mul_pd(xd2, prescale2); - xd1 = _mm_mul_pd(xd1, prescale2); - xd3 = _mm_mul_pd(xd3, prescale2); - - xi0 = _mm_cvtpd_epi32(xd0); - xi2 = _mm_cvtpd_epi32(xd2); - - xi1 = _mm_cvtpd_epi32(xd1); - xi3 = _mm_cvtpd_epi32(xd3); - - xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0)); - xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2)); - xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1)); - xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3)); - - xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2)); - xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3)); - - xf0 = _mm_mul_ps(xf0, postscale4); - xf1 = _mm_mul_ps(xf1, postscale4); - - xi0 = _mm_unpacklo_epi64(xi0, xi2); - xi1 = _mm_unpacklo_epi64(xi1, xi3); - xi0 = _mm_packs_epi32(xi0, xi1); - - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); - - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - - __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yd1 = 
_mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); - __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); - - yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); - yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); - - __m128 zf0 = _mm_add_ps(xf0, mA1); - __m128 zf1 = _mm_add_ps(xf1, mA1); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); - - zf0 = _mm_mul_ps(zf0, yf0); - zf1 = _mm_mul_ps(zf1, yf1); - - if( y_aligned ) - { - _mm_store_ps(y + i, zf0); - _mm_store_ps(y + i + 4, zf1); - } - else - { - _mm_storeu_ps(y + i, zf0); - _mm_storeu_ps(y + i + 4, zf1); - } + v_store(y + i, zf0); + v_store(y + i + VECSZ, zf1); } } - else + vx_cleanup(); #endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; - int val0, val1, val2, val3, t; - - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) - x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) - x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) - x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); - - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; - - t = (val0 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[0].i = t << 23; - - t = (val1 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[1].i = t << 23; - - t = (val2 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[2].i = t << 23; - - t = (val3 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[3].i = t << 23; - - x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); - - y[i] = (float)x0; - y[i + 1] = (float)x1; - - x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); - - y[i + 2] = (float)x2; - y[i + 3] = (float)x3; - } for( ; i < n; i++ ) { - double x0 = x[i].f * exp_prescale; - int val0, t; + float x0 = x[i].f; + x0 = std::min(std::max(x0, minval), maxval); + x0 *= (float)exp_prescale; + Cv32suf buf; - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; + int xi = saturate_cast(x0); + x0 = (x0 - xi)*postscale; - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 127; + int t = (xi >> EXPTAB_SCALE) + 127; t = !(t & ~255) ? t : t < 0 ? 
0 : 255; + buf.i = t << 23; - buf[0].i = t << 23; - x0 = (x0 - val0)*exp_postscale; - - y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); + y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4); } } @@ -772,162 +620,111 @@ void exp64f( const double *_x, double *y, int n ) A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; -#undef EXPPOLY -#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) - int i = 0; - Cv64suf buf[4]; const Cv64suf* x = (const Cv64suf*)_x; - -#if CV_SSE2 - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128d postscale2 = _mm_set1_pd(exp_postscale); - static const __m128d maxval2 = _mm_set1_pd(exp_max_val); - static const __m128d minval2 = _mm_set1_pd(-exp_max_val); - - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - - int CV_DECL_ALIGNED(16) tab_idx[4]; - - for( ; i <= n - 4; i += 4 ) + double minval = (-exp_max_val/exp_prescale); + double maxval = (exp_max_val/exp_prescale); + +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + static const v_float64 vprescale = vx_setall_f64(exp_prescale); + static const v_float64 vpostscale = vx_setall_f64(exp_postscale); + static const v_float64 vminval = vx_setall_f64(minval); + static const v_float64 vmaxval = vx_setall_f64(maxval); + + static const v_float64 vA1 = vx_setall_f64(A1); + static const v_float64 vA2 = vx_setall_f64(A2); + static const v_float64 vA3 = vx_setall_f64(A3); + static const v_float64 vA4 = vx_setall_f64(A4); + static const v_float64 vA5 = vx_setall_f64(A5); + + static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + bool y_aligned = (size_t)(void*)y % 32 == 0; + + for( ; i < n; i += VECSZ*2 ) { - __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); - __m128i xi0, xi1; - xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); - xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); - xf0 = _mm_mul_pd(xf0, prescale2); - xf1 = _mm_mul_pd(xf1, prescale2); - - xi0 = _mm_cvtpd_epi32(xf0); - xi1 = _mm_cvtpd_epi32(xf1); - xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); - xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); - - xi0 = _mm_unpacklo_epi64(xi0, xi1); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); - - xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); - xi0 = _mm_packs_epi32(xi0, xi0); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); - - __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); - yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); - - __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); - __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); - - zf0 = 
_mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); - - zf0 = _mm_mul_pd(zf0, yf0); - zf1 = _mm_mul_pd(zf1, yf1); - - _mm_storeu_pd(y + i, zf0); - _mm_storeu_pd(y + i + 2, zf1); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; + if( i + VECSZ*2 > n ) + { + if( i == 0 || _x == y ) + break; + i = n - VECSZ*2; + y_aligned = false; + } - double y0, y1, y2, y3; - int val0, val1, val2, val3, t; + v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; + xf0 = v_min(v_max(xf0, vminval), vmaxval); + xf1 = v_min(v_max(xf1, vminval), vmaxval); - t = (int)(x[i+1].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x1 = t < 0 ? -exp_max_val : exp_max_val; + xf0 *= vprescale; + xf1 *= vprescale; - t = (int)(x[i+2].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x2 = t < 0 ? -exp_max_val : exp_max_val; + v_int32 xi0 = v_round(xf0); + v_int32 xi1 = v_round(xf1); + xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale; + xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale; - t = (int)(x[i+3].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x3 = t < 0 ? -exp_max_val : exp_max_val; + v_float64 yf0 = v_lut(expTab, xi0 & vidxmask); + v_float64 yf1 = v_lut(expTab, xi1 & vidxmask); - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); + v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047); + xi0 = v_min(v_max(v_shr(xi0) + v1023, v0), v2047); + xi1 = v_min(v_max(v_shr(xi1) + v1023, v0), v2047); - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; + v_int64 xq0, xq1, dummy; + v_expand(xi0, xq0, dummy); + v_expand(xi1, xq1, dummy); - t = (val0 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[0].i = (int64)t << 52; + yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0)); + yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1)); - t = (val1 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[1].i = (int64)t << 52; + v_float64 zf0 = xf0 + vA1; + v_float64 zf1 = xf1 + vA1; - t = (val2 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[2].i = (int64)t << 52; + zf0 = v_fma(zf0, xf0, vA2); + zf1 = v_fma(zf1, xf1, vA2); - t = (val3 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 
0 : 2047; - buf[3].i = (int64)t << 52; + zf0 = v_fma(zf0, xf0, vA3); + zf1 = v_fma(zf1, xf1, vA3); - y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); + zf0 = v_fma(zf0, xf0, vA4); + zf1 = v_fma(zf1, xf1, vA4); - y[i] = y0; - y[i + 1] = y1; + zf0 = v_fma(zf0, xf0, vA5); + zf1 = v_fma(zf1, xf1, vA5); - y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); + zf0 *= yf0; + zf1 *= yf1; - y[i + 2] = y2; - y[i + 3] = y3; + if( y_aligned ) + { + v_store_aligned(y + i, zf0); + v_store_aligned(y + i + VECSZ, zf1); + } + else + { + v_store(y + i, zf0); + v_store(y + i + VECSZ, zf1); + } } + vx_cleanup(); +#endif for( ; i < n; i++ ) { - double x0 = x[i].f * exp_prescale; - int val0, t; + double x0 = x[i].f; + x0 = std::min(std::max(x0, minval), maxval); + x0 *= exp_prescale; + Cv64suf buf; - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; + int xi = saturate_cast(x0); + x0 = (x0 - xi)*exp_postscale; - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 1023; + int t = (xi >> EXPTAB_SCALE) + 1023; t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf.i = (int64)t << 52; - buf[0].i = (int64)t << 52; - x0 = (x0 - val0)*exp_postscale; - - y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5); } } @@ -937,12 +734,10 @@ void exp64f( const double *_x, double *y, int n ) /////////////////////////////////////////// LOG /////////////////////////////////////// -#define LOGTAB_SCALE 8 +#define LOGTAB_SCALE 8 #define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) -#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) -#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) -static const double CV_DECL_ALIGNED(16) icvLogTab[] = { +static const double CV_DECL_ALIGNED(16) logTab[] = { 0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, @@ -1201,154 +996,85 @@ static const double CV_DECL_ALIGNED(16) icvLogTab[] = { .69314718055994530941723212145818, 5.0e-01, }; +static float logTab_f[(LOGTAB_MASK+1)*2]; +static volatile bool logTab_f_initialized = false; - -#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) +#define LOGTAB_TRANSLATE(tab, x, h) (((x) - 1.f)*tab[(h)+1]) static const double ln_2 = 0.69314718055994530941723212145818; void log32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() - static const float shift[] = { 0, -1.f/512 }; + if( !logTab_f_initialized ) + { + for( int j = 0; j < (LOGTAB_MASK+1)*2; j++ ) + logTab_f[j] = (float)logTab[j]; + logTab_f_initialized = true; + } + + static const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1; static const float A0 = 0.3333333333333333333333333f, A1 = -0.5f, A2 = 1.f; -#undef LOGPOLY -#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) - int i = 0; - Cv32suf buf[4]; const int* x = (const int*)_x; -#if CV_SSE2 - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128 _1_4 = _mm_set1_ps(1.f); - static const __m128 shift4 = _mm_set1_ps(-1.f/512); - - static const __m128 mA0 = _mm_set1_ps(A0); - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + static const 
v_float32 vln2 = vx_setall_f32((float)ln_2); + static const v_float32 v1 = vx_setall_f32(1.f); + static const v_float32 vshift = vx_setall_f32(-1.f/512); - int CV_DECL_ALIGNED(16) idx[4]; + static const v_float32 vA0 = vx_setall_f32(A0); + static const v_float32 vA1 = vx_setall_f32(A1); + static const v_float32 vA2 = vx_setall_f32(A2); - for( ; i <= n - 4; i += 4 ) + for( ; i < n; i += VECSZ ) { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); - - __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); - - h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - - __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); - xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); - xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); - - __m128 zf0 = _mm_mul_ps(xf0, mA0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); - yf0 = _mm_add_ps(yf0, zf0); - - _mm_storeu_ps(y + i, yf0); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = x[i]; - h1 = x[i+1]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; - y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; - - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = x[i+2]; - h3 = x[i+3]; - - x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); - - buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); - buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); - - y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; - y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; + if( i + VECSZ > n ) + { + if( i == 0 || _x == y ) + break; + i = n - VECSZ; + } - h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + v_int32 h0 = vx_load(x + i); + v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127); + v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23); - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; + h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2); + v_float32 yf0, xf0; - x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); + v_lut_deinterleave(logTab_f, h0, yf0, xf0); - x0 += shift[h0 == 510]; - x1 += shift[h1 == 510]; - y0 += LOGPOLY( x0 ); - y1 += LOGPOLY( x1 ); + yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0); - y[i] = (float) y0; - y[i + 1] = (float) y1; + v_float32 delta = v_reinterpret_as_f32(h0 == 
vx_setall_s32(510)) & vshift; + xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta); - x2 += shift[h2 == 510]; - x3 += shift[h3 == 510]; - y2 += LOGPOLY( x2 ); - y3 += LOGPOLY( x3 ); + v_float32 zf0 = v_fma(xf0, vA0, vA1); + zf0 = v_fma(zf0, xf0, vA2); + zf0 = v_fma(zf0, xf0, yf0); - y[i + 2] = (float) y2; - y[i + 3] = (float) y3; + v_store(y + i, zf0); } + vx_cleanup(); +#endif for( ; i < n; i++ ) { - int h0 = x[i]; - double y0; - float x0; - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; + Cv32suf buf; + int i0 = x[i]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23); + int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2); - y0 += icvLogTab[h0]; - x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); - x0 += shift[h0 == 510]; - y0 += LOGPOLY( x0 ); - - y[i] = (float)y0; + float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx]; + float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f); + y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0; } } @@ -1356,7 +1082,7 @@ void log64f( const double *x, double *y, int n ) { CV_INSTRUMENT_REGION() - static const double shift[] = { 0, -1./512 }; + static const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1; static const double A7 = 1.0, A6 = -0.5, @@ -1367,175 +1093,69 @@ void log64f( const double *x, double *y, int n ) A1 = 0.1428571428571428769682682968777953647077083587646484375, A0 = -0.125; -#undef LOGPOLY -#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ -(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ -(((A1*xq + A3)*xq + A5)*xq + A7)*(x)) - int i = 0; - DBLINT buf[4]; - DBLINT *X = (DBLINT *) x; -#if CV_SSE2 - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128d _1_2 = _mm_set1_pd(1.); - static const __m128d shift2 = _mm_set1_pd(-1./512); +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + static const v_float64 vln2 = vx_setall_f64(ln_2); - static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); - static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); + static const v_float64 + vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1), + vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3), + vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5), + vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7); - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - static const __m128d mA6 = _mm_set1_pd(A6); - static const __m128d mA7 = _mm_set1_pd(A7); - - int CV_DECL_ALIGNED(16) idx[4]; - - for( ; i <= n - 4; i += 4 ) + for( ; i < n; i += VECSZ ) { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); - - __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); - __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); - - h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); - - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), - _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); - - h0 = 
_mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); - xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); - - xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); - xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); - - __m128d zd0 = _mm_mul_pd(xd0, mA0); - __m128d zd1 = _mm_mul_pd(xd1, mA0); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); - - yd0 = _mm_add_pd(yd0, zd0); - yd1 = _mm_add_pd(yd1, zd1); - - _mm_storeu_pd(y + i, yd0); - _mm_storeu_pd(y + i + 2, yd1); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double xq; - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = X[i].i.lo; - h1 = X[i + 1].i.lo; - buf[0].i.lo = h0; - buf[1].i.lo = h1; - - h0 = X[i].i.hi; - h1 = X[i + 1].i.hi; - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); - - y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = X[i + 2].i.lo; - h3 = X[i + 3].i.lo; - buf[2].i.lo = h2; - buf[3].i.lo = h3; - - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = X[i + 2].i.hi; - h3 = X[i + 3].i.hi; - - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); - - buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); - buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); - - y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; - y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + if( i + VECSZ > n ) + { + if( i == 0 || x == y ) + break; + i = n - VECSZ; + } - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; + v_int64 h0 = vx_load((const int64*)x + i); + v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64()); + yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023); - x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); + v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52); + h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0); + v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2); - y0 += LOGPOLY( x0, h0 == 510 ); - y1 += LOGPOLY( x1, h1 == 510 ); + v_float64 xf0, yf0; + v_lut_deinterleave(logTab, idx, yf0, xf0); - 
y[i] = y0; - y[i + 1] = y1; + yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0); + v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512); + xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta); - y2 += LOGPOLY( x2, h2 == 510 ); - y3 += LOGPOLY( x3, h3 == 510 ); + v_float64 xq = xf0*xf0; + v_float64 zf0 = v_fma(xq, vA0, vA2); + v_float64 zf1 = v_fma(xq, vA1, vA3); + zf0 = v_fma(zf0, xq, vA4); + zf1 = v_fma(zf1, xq, vA5); + zf0 = v_fma(zf0, xq, vA6); + zf1 = v_fma(zf1, xq, vA7); + zf1 = v_fma(zf1, xf0, yf0); + zf0 = v_fma(zf0, xq, zf1); - y[i + 2] = y2; - y[i + 3] = y3; + v_store(y + i, zf0); } +#endif for( ; i < n; i++ ) { - int h0 = X[i].i.hi; - double xq; - double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[0].i.lo = X[i].i.lo; - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - y0 += LOGPOLY( x0, h0 == 510 ); - y[i] = y0; + Cv64suf buf; + int64 i0 = ((const int64*)x)[i]; + + buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52); + int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2); + + double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx]; + double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.); + + double xq = x0*x0; + y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0; } } @@ -1543,7 +1163,7 @@ void log64f( const double *x, double *y, int n ) float fastAtan2( float y, float x ) { - return atanImpl(y, x); + return atan_f32(y, x); } #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 4171babc03..9a1130fe96 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -241,9 +241,9 @@ TEST(hal_intrin, float64x2) { } #endif -TEST(hal_intrin,float16x4) +TEST(hal_intrin,float16) { - CV_CPU_CALL_FP16_(test_hal_intrin_float16x4, ()); + CV_CPU_CALL_FP16_(test_hal_intrin_float16, ()); throw SkipTestException("Unsupported hardware: FP16 is not available"); } diff --git a/modules/core/test/test_intrin.fp16.cpp b/modules/core/test/test_intrin.fp16.cpp index 7855fda287..893c5f147a 100644 --- a/modules/core/test/test_intrin.fp16.cpp +++ b/modules/core/test/test_intrin.fp16.cpp @@ -7,9 +7,9 @@ namespace opencv_test { namespace hal { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void test_hal_intrin_float16x4() +void test_hal_intrin_float16() { - TheTest() + TheTest() .test_loadstore_fp16() .test_float_cvt_fp16() ; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 7579d9cf05..2f8c1cf0b7 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -6,7 +6,7 @@ namespace opencv_test { namespace hal { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void test_hal_intrin_float16x4(); +void test_hal_intrin_float16(); #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY @@ -50,6 +50,8 @@ template <> struct initializer<2> template struct Data { typedef typename R::lane_type LaneType; + typedef typename V_TypeTraits::int_type int_type; + Data() { for (int i = 0; i < R::nlanes; ++i) @@ -104,6 +106,17 @@ template struct Data CV_Assert(i >= 0 && i < R::nlanes); return d[i]; } + int_type as_int(int i) const + { + CV_Assert(i >= 0 && i < R::nlanes); + union + { + LaneType l; + int_type i; + } v; + v.l = d[i]; + return v.i; + } const LaneType * mid() const { return d + R::nlanes / 2; @@ -247,8 +260,9 
@@ template struct TheTest EXPECT_EQ(d, res); // zero, all - Data resZ = V_RegTrait128::zero(); - Data resV = V_RegTrait128::all(8); + Data resZ, resV; + resZ.fill((LaneType)0); + resV.fill((LaneType)8); for (int i = 0; i < R::nlanes; ++i) { EXPECT_EQ((LaneType)0, resZ[i]); @@ -339,7 +353,7 @@ template struct TheTest // v_expand and v_load_expand TheTest & test_expand() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; Data dataA; R a = dataA; @@ -362,7 +376,7 @@ template struct TheTest TheTest & test_expand_q() { - typedef typename V_RegTrait128::q_reg Rx4; + typedef typename V_RegTraits::q_reg Rx4; Data data; Data out = v_load_expand_q(data.d); const int n = Rx4::nlanes; @@ -436,7 +450,7 @@ template struct TheTest TheTest & test_mul_expand() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; Data dataA, dataB(2); R a = dataA, b = dataB; Rx2 c, d; @@ -456,7 +470,7 @@ template struct TheTest TheTest & test_abs() { - typedef typename V_RegTrait128::u_reg Ru; + typedef typename V_RegTraits::u_reg Ru; typedef typename Ru::lane_type u_type; Data dataA, dataB(10); R a = dataA, b = dataB; @@ -520,7 +534,7 @@ template struct TheTest TheTest & test_dot_prod() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB(2); @@ -608,7 +622,7 @@ template struct TheTest TheTest & test_absdiff() { - typedef typename V_RegTrait128::u_reg Ru; + typedef typename V_RegTraits::u_reg Ru; typedef typename Ru::lane_type u_type; Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); @@ -657,12 +671,21 @@ template struct TheTest TheTest & test_mask() { - typedef V_TypeTraits Traits; - typedef typename Traits::int_type int_type; + typedef typename V_RegTraits::int_reg int_reg; + typedef typename V_RegTraits::u_reg uint_reg; + typedef typename int_reg::lane_type int_type; + typedef typename uint_reg::lane_type uint_type; Data dataA, dataB(0), dataC, dataD(1), dataE(2); dataA[1] *= (LaneType)-1; - const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0)); + union + { + LaneType l; + uint_type ui; + } + all1s; + all1s.ui = (uint_type)-1; + LaneType mask_one = all1s.l; dataB[1] = mask_one; dataB[R::nlanes / 2] = mask_one; dataB[R::nlanes - 1] = mask_one; @@ -684,10 +707,8 @@ template struct TheTest Data resF = f; for (int i = 0; i < R::nlanes; ++i) { - int_type m2 = Traits::reinterpret_int(dataB[i]); - EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2) - | (Traits::reinterpret_int(dataE[i]) & ~m2), - Traits::reinterpret_int(resF[i])); + int_type m2 = dataB.as_int(i); + EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i)); } return *this; @@ -697,7 +718,7 @@ template struct TheTest TheTest & test_pack() { SCOPED_TRACE(s); - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB; dataA += std::numeric_limits::is_signed ? 
-10 : 10; @@ -734,8 +755,9 @@ template struct TheTest TheTest & test_pack_u() { SCOPED_TRACE(s); - typedef typename V_TypeTraits::w_type LaneType_w; - typedef typename V_RegTrait128::int_reg Ri2; + //typedef typename V_RegTraits::w_type LaneType_w; + typedef typename V_RegTraits::w_reg R2; + typedef typename V_RegTraits::int_reg Ri2; typedef typename Ri2::lane_type w_type; Data dataA, dataB; @@ -864,7 +886,7 @@ template struct TheTest TheTest & test_float_math() { - typedef typename V_RegTrait128::int_reg Ri; + typedef typename V_RegTraits::round_reg Ri; Data data1, data2, data3; data1 *= 1.1; data2 += 10; @@ -1005,31 +1027,28 @@ template struct TheTest TheTest & test_loadstore_fp16() { -#if CV_FP16 && CV_SIMD128 +#if CV_FP16 && CV_SIMD AlignedData data; AlignedData out; - if(1 /* checkHardwareSupport(CV_CPU_FP16) */ ) - { - // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); - - // check some initialization methods - R r1 = data.u; - R r2 = v_load_f16(data.a.d); - R r3(r2); - EXPECT_EQ(data.u[0], r1.get0()); - EXPECT_EQ(data.a[0], r2.get0()); - EXPECT_EQ(data.a[0], r3.get0()); - - // check some store methods - out.a.clear(); - v_store_f16(out.a.d, r1); - EXPECT_EQ(data.a, out.a); - } + // check if addresses are aligned and unaligned respectively + EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); + + // check some initialization methods + R r1 = data.u; + R r2 = v_load_f16(data.a.d); + R r3(r2); + EXPECT_EQ(data.u[0], r1.get0()); + EXPECT_EQ(data.a[0], r2.get0()); + EXPECT_EQ(data.a[0], r3.get0()); + + // check some store methods + out.a.clear(); + v_store(out.a.d, r1); + EXPECT_EQ(data.a, out.a); return *this; #endif @@ -1037,18 +1056,15 @@ template struct TheTest TheTest & test_float_cvt_fp16() { -#if CV_FP16 && CV_SIMD128 - AlignedData data; - - if(1 /* checkHardwareSupport(CV_CPU_FP16) */) - { - // check conversion - v_float32x4 r1 = v_load(data.a.d); - v_float16x4 r2 = v_cvt_f16(r1); - v_float32x4 r3 = v_cvt_f32(r2); - EXPECT_EQ(0x3c00, r2.get0()); - EXPECT_EQ(r3.get0(), r1.get0()); - } +#if CV_FP16 && CV_SIMD + AlignedData data; + + // check conversion + v_float32 r1 = vx_load(data.a.d); + v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32()); + v_float32 r3 = v_cvt_f32(r2); + EXPECT_EQ(0x3c00, r2.get0()); + EXPECT_EQ(r3.get0(), r1.get0()); return *this; #endif diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 44b6ebdbb3..68dfc3c969 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -134,7 +134,9 @@ double Core_PowTest::get_success_error_level( int test_case_idx, int i, int j ) if( depth < CV_32F ) return power == cvRound(power) && power >= 0 ? 0 : 1; else - return Base::get_success_error_level( test_case_idx, i, j ); + { + return depth != CV_64F ? 
Base::get_success_error_level( test_case_idx, i, j ) : DBL_EPSILON*1024*1.1; + } } diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index a639dc2e18..a5366bf6fd 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -2129,7 +2129,7 @@ int cmpEps2( TS* ts, const Mat& a, const Mat& b, double success_err_level, switch( code ) { case CMP_EPS_BIG_DIFF: - sprintf( msg, "%s: Too big difference (=%g)", desc, diff ); + sprintf( msg, "%s: Too big difference (=%g > %g)", desc, diff, success_err_level ); code = TS::FAIL_BAD_ACCURACY; break; case CMP_EPS_INVALID_TEST_DATA: