|
|
|
@ -302,8 +302,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps) |
|
|
|
|
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
|
|
|
|
|
{ return _Tpvec(cast(a.val)); } |
|
|
|
|
|
|
|
|
|
// Expand OPENCV_HAL_IMPL_SSE_INITVEC for the 8-, 16- and 32-bit integer vector types.
// NOTE(review): v_uint8x16 and v_int8x16 are each listed twice, once with `char` and
// once with `schar` as the sixth argument. This looks like the old and the new side
// of a diff shown together -- confirm that only one pair is meant to remain.
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP) |
|
|
|
@ -1393,6 +1393,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ |
|
|
|
|
// Expand OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP for the two floating-point vector types.
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) |
|
|
|
|
|
|
|
|
|
inline unsigned v_reduce_sum(const v_uint8x16& a) |
|
|
|
|
{ |
|
|
|
|
__m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128()); |
|
|
|
|
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))); |
|
|
|
|
} |
|
|
|
|
inline int v_reduce_sum(const v_int8x16& a) |
|
|
|
|
{ |
|
|
|
|
__m128i half = _mm_set1_epi8((schar)-128); |
|
|
|
|
half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128()); |
|
|
|
|
return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048; |
|
|
|
|
} |
|
|
|
|
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \ |
|
|
|
|
inline schar v_reduce_##func(const v_int8x16& a) \
|
|
|
|
|
{ \
|
|
|
|
|
__m128i val = a.val; \
|
|
|
|
|
__m128i smask = _mm_set1_epi8((schar)-128); \
|
|
|
|
|
val = _mm_xor_si128(val, smask); \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
|
|
|
|
|
return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
|
|
|
|
|
} \
|
|
|
|
|
inline uchar v_reduce_##func(const v_uint8x16& a) \
|
|
|
|
|
{ \
|
|
|
|
|
__m128i val = a.val; \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
|
|
|
|
|
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
|
|
|
|
|
return (uchar)_mm_cvtsi128_si32(val); \
|
|
|
|
|
} |
|
|
|
|
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min) |
|
|
|
|
|
|
|
|
|
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \ |
|
|
|
|
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
|
|
|
|
|
{ \
|
|
|
|
@ -1412,26 +1447,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \ |
|
|
|
|
val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
|
|
|
|
|
return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
|
|
|
|
|
} |
|
|
|
|
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \ |
|
|
|
|
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
|
|
|
|
|
{ \
|
|
|
|
|
__m128i val = a.val; \
|
|
|
|
|
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
|
|
|
|
|
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
|
|
|
|
|
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
|
|
|
|
|
return (scalartype)_mm_cvtsi128_si32(val); \
|
|
|
|
|
} \
|
|
|
|
|
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
|
|
|
|
|
{ \
|
|
|
|
|
__m128i val = a.val; \
|
|
|
|
|
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
|
|
|
|
|
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
|
|
|
|
|
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
|
|
|
|
|
return (unsigned scalartype)_mm_cvtsi128_si32(val); \
|
|
|
|
|
} |
|
|
|
|
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16) |
|
|
|
|
|
|
|
|
|
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \ |
|
|
|
|
inline scalartype v_reduce_sum(const _Tpvec& a) \
|
|
|
|
@ -1456,6 +1473,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV |
|
|
|
|
// Expand OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM to define v_reduce_sum for v_int32x4
// (returning int) and v_float32x4 (returning float).
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) |
|
|
|
|
|
|
|
|
|
inline int v_reduce_sum(const v_int16x8& a) |
|
|
|
|
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } |
|
|
|
|
inline unsigned v_reduce_sum(const v_uint16x8& a) |
|
|
|
|
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } |
|
|
|
|
|
|
|
|
|
// Sum of the two unsigned 64-bit lanes of a.
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
    // Spill both lanes to an aligned scratch buffer and add them as scalars.
    uint64 CV_DECL_ALIGNED(32) lanes[2];
    v_store_aligned(lanes, a);
    const uint64 total = lanes[0] + lanes[1];
    return total;
}
|
|
|
|
// Sum of the two signed 64-bit lanes of a.
inline int64 v_reduce_sum(const v_int64x2& a)
{
    // Spill both lanes to an aligned scratch buffer and add them as scalars.
    int64 CV_DECL_ALIGNED(32) lanes[2];
    v_store_aligned(lanes, a);
    const int64 total = lanes[0] + lanes[1];
    return total;
}
|
|
|
|
inline double v_reduce_sum(const v_float64x2& a) |
|
|
|
|
{ |
|
|
|
|
double CV_DECL_ALIGNED(32) idx[2]; |
|
|
|
@ -1520,27 +1554,42 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) |
|
|
|
|
return v_reduce_sum(v_absdiff(a, b)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \ |
|
|
|
|
inline v_uint32x4 v_popcount(const _Tpvec& a) \
|
|
|
|
|
{ \
|
|
|
|
|
__m128i m1 = _mm_set1_epi32(0x55555555); \
|
|
|
|
|
__m128i m2 = _mm_set1_epi32(0x33333333); \
|
|
|
|
|
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
|
|
|
|
|
__m128i p = a.val; \
|
|
|
|
|
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
|
|
|
|
|
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
|
|
|
|
|
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
|
|
|
|
|
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
|
|
|
|
|
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
|
|
|
|
|
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8) |
|
|
|
|
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4) |
|
|
|
|
// Per-byte population count: each 8-bit lane of the result holds the number of
// set bits in the matching lane of a.
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
    const __m128i mask1 = _mm_set1_epi32(0x55555555);
    const __m128i mask2 = _mm_set1_epi32(0x33333333);
    const __m128i mask4 = _mm_set1_epi32(0x0f0f0f0f);
    __m128i cnt = a.val;

    // Add neighbouring 1-bit fields into 2-bit counts.
    const __m128i hi1 = _mm_and_si128(_mm_srli_epi32(cnt, 1), mask1);
    const __m128i lo1 = _mm_and_si128(cnt, mask1);
    cnt = _mm_add_epi32(hi1, lo1);

    // Add neighbouring 2-bit counts into 4-bit counts.
    const __m128i hi2 = _mm_and_si128(_mm_srli_epi32(cnt, 2), mask2);
    const __m128i lo2 = _mm_and_si128(cnt, mask2);
    cnt = _mm_add_epi32(hi2, lo2);

    // Add neighbouring 4-bit counts into one count per byte.
    const __m128i hi4 = _mm_and_si128(_mm_srli_epi32(cnt, 4), mask4);
    const __m128i lo4 = _mm_and_si128(cnt, mask4);
    cnt = _mm_add_epi32(hi4, lo4);

    return v_uint8x16(cnt);
}
|
|
|
|
// Population count per 16-bit lane, built on top of the per-byte count.
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
    v_uint8x16 bytes = v_popcount(v_reinterpret_as_u8(a));
    // Add the vector rotated by one byte so the low byte of each 16-bit lane
    // accumulates both byte counts of that lane.
    bytes += v_rotate_right<1>(bytes);
    // Only the low byte of each lane carries the result.
    const v_uint16x8 low_byte = v_setall_u16(0x00ff);
    return v_reinterpret_as_u16(bytes) & low_byte;
}
|
|
|
|
// Population count per 32-bit lane, built on top of the per-byte count.
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
    v_uint8x16 bytes = v_popcount(v_reinterpret_as_u8(a));
    // Two rotate-and-add steps (by one byte, then by two) accumulate the four
    // byte counts of each 32-bit lane into its low byte.
    bytes += v_rotate_right<1>(bytes);
    bytes += v_rotate_right<2>(bytes);
    // Only the low byte of each lane carries the result.
    const v_uint32x4 low_byte = v_setall_u32(0x000000ff);
    return v_reinterpret_as_u32(bytes) & low_byte;
}
|
|
|
|
// Population count per 64-bit lane.
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
    const v_uint8x16 bytes = v_popcount(v_reinterpret_as_u8(a));
    // A sum of absolute differences against zero adds the eight byte counts of
    // each 64-bit half into that half.
    const __m128i zero = _mm_setzero_si128();
    return v_uint64x2(_mm_sad_epu8(bytes.val, zero));
}
|
|
|
|
// Signed 8-bit overload: reinterprets the lanes as unsigned and forwards.
inline v_uint8x16 v_popcount(const v_int8x16& a)
{
    const v_uint8x16 bits = v_reinterpret_as_u8(a);
    return v_popcount(bits);
}
|
|
|
|
// Signed 16-bit overload: reinterprets the lanes as unsigned and forwards.
inline v_uint16x8 v_popcount(const v_int16x8& a)
{
    const v_uint16x8 bits = v_reinterpret_as_u16(a);
    return v_popcount(bits);
}
|
|
|
|
// Signed 32-bit overload: reinterprets the lanes as unsigned and forwards.
inline v_uint32x4 v_popcount(const v_int32x4& a)
{
    const v_uint32x4 bits = v_reinterpret_as_u32(a);
    return v_popcount(bits);
}
|
|
|
|
// Signed 64-bit overload: reinterprets the lanes as unsigned and forwards.
inline v_uint64x2 v_popcount(const v_int64x2& a)
{
    const v_uint64x2 bits = v_reinterpret_as_u64(a);
    return v_popcount(bits);
}
|
|
|
|
|
|
|
|
|
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ |
|
|
|
|
inline int v_signmask(const _Tpvec& a) \
|
|
|
|
|