@@ -899,6 +899,15 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);
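// Annotation (not part of the patch): SSE2 has no 64-bit integer compare
// instruction, so the macro above reinterprets each lane as a double and
// reuses the pd comparison path on the raw bits. A minimal usage sketch,
// assuming the usual universal-intrinsic constructors:
//
//   v_uint64x2 x(1, 2), y(1, 3);
//   v_uint64x2 m = (x == y);   // lane 0: all-ones mask, lane 1: zero
//
// Caveat of the double-compare trick: integer bit patterns that decode to
// NaN compare unequal to themselves, and the +0.0/-0.0 patterns compare
// equal despite differing bits, so the result is exact only for lanes that
// avoid those encodings.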
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)

@@ -1520,6 +1529,35 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));

    a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
    c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
}
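// Annotation: memory holds a0 b0 c0 a1 b1 c1, so the three raw loads are
// t0 = [a0 b0], t1 = [c0 a1], t2 = [b1 c1]; each output then gathers its
// two lanes with one unpacklo/unpackhi pair, e.g.
// a = unpacklo(t0, unpackhi(t1, t1)) = [a0 a1]. A usage sketch with a
// hypothetical caller:
//
//   uint64 buf[6] = { 1, 2, 3, 4, 5, 6 };  // a0 b0 c0 a1 b1 c1
//   v_uint64x2 a, b, c;
//   v_load_deinterleave(buf, a, b, c);     // a={1,4}, b={2,5}, c={3,6}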
inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_s64(t0);
    b = v_reinterpret_as_s64(t1);
    c = v_reinterpret_as_s64(t2);
}

inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_f64(t0);
    b = v_reinterpret_as_f64(t1);
    c = v_reinterpret_as_f64(t2);
}
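// Annotation: the int64 and double overloads above are thin wrappers over
// the uint64 kernel; v_reinterpret_as_s64 / v_reinterpret_as_f64 are
// bit-for-bit casts (_mm_castsi128_pd and friends), so they add no work.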
// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{

@@ -1717,6 +1755,27 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    _mm_storeu_ps((ptr + 4), u1);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);

    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
}
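// Annotation: the store side is the inverse shuffle of the deinterleave
// above; one unpack per output rebuilds the interleaved pairs
// [a0 b0], [c0 a1], [b1 c1] before the three unaligned stores.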
inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}

inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}
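// Round-trip sketch (hypothetical caller, not part of the patch): storing
// then reloading is lossless for the 64-bit variants:
//
//   double buf[6];
//   v_float64x2 a0(0.5, 3.5), b0(1.5, 4.5), c0(2.5, 5.5);
//   v_store_interleave(buf, a0, b0, c0);   // buf = 0.5 1.5 2.5 3.5 4.5 5.5
//   v_float64x2 a, b, c;
//   v_load_deinterleave(buf, a, b, c);     // a==a0, b==b0, c==c0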
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \