diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index d9719b7fa0..c3e89b98c1 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -213,7 +213,7 @@ Regular integers: |min, max | x | x | x | x | x | x | |absdiff | x | x | x | x | x | x | |absdiffs | | x | | x | | | -|reduce | | | | | x | x | +|reduce | x | x | x | x | x | x | |mask | x | x | x | x | x | x | |pack | x | x | x | x | x | x | |pack_u | x | | x | | | | @@ -670,7 +670,7 @@ Scheme: @code {A1 A2 A3 ...} => min(A1,A2,A3,...) @endcode -For 32-bit integer and 32-bit floating point types. */ +For all types except 64-bit integer and 64-bit floating point types. */ OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) /** @brief Find one max value @@ -679,7 +679,7 @@ Scheme: @code {A1 A2 A3 ...} => max(A1,A2,A3,...) @endcode -For 32-bit integer and 32-bit floating point types. */ +For all types except 64-bit integer and 64-bit floating point types. 
*/ OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) static const unsigned char popCountTable[] = @@ -1219,7 +1219,7 @@ Scheme: @code {A1 A2 A3 ...} => sum{A1,A2,A3,...} @endcode -For 32-bit integer and 32-bit floating point types.*/ +*/ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) { typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 559222bb57..280691b448 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1241,6 +1241,20 @@ inline int v_reduce_sum(const v_int16x8& a) return vget_lane_s32(vpadd_s32(t1, t1), 0); } +#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \ + a0 = vp##vectorfunc##_##suffix(a0, a0); \ + a0 = vp##vectorfunc##_##suffix(a0, a0); \ + return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \ +} + +OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8) + #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ { \ @@ -1249,10 +1263,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \ return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \ } -OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min,
u16) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16) #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp index da75e20a1d..b4e5e4632a 100644 --- a/modules/core/src/minmax.cpp +++ b/modules/core/src/minmax.cpp @@ -71,33 +71,759 @@ minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, *_maxVal = maxVal; } +#if CV_SIMD128 +template<typename T, typename WT> CV_ALWAYS_INLINE void +minMaxIdx_init( const T* src, const uchar* mask, WT* minval, WT* maxval, + size_t* minidx, size_t* maxidx, WT &minVal, WT &maxVal, + size_t &minIdx, size_t &maxIdx, const WT minInit, const WT maxInit, + const int nlanes, int len, size_t startidx, int &j, int &len0 ) +{ + len0 = len & -nlanes; + j = 0; + + minVal = *minval, maxVal = *maxval; + minIdx = *minidx, maxIdx = *maxidx; + + // To handle start values out of range + if ( minVal < minInit || maxVal < minInit || minVal > maxInit || maxVal > maxInit ) + { + uchar done = 0x00; + + for ( ; (j < len) && (done != 0x03); j++ ) + { + if ( !mask || mask[j] ) { + T val = src[j]; + if ( val < minVal ) + { + minVal = val; + minIdx = startidx + j; + done |= 0x01; + } + if ( val > maxVal ) + { + maxVal = val; + maxIdx = startidx + j; + done |= 0x02; + } + } + } + + len0 = j + ((len - j) & -nlanes); + } +} + +#if CV_SIMD128_64F +CV_ALWAYS_INLINE double v_reduce_min(const v_float64x2& a) +{ + double CV_DECL_ALIGNED(32) idx[2]; + v_store_aligned(idx, a); + return std::min(idx[0], idx[1]); +} +
+CV_ALWAYS_INLINE double v_reduce_max(const v_float64x2& a) +{ + double CV_DECL_ALIGNED(32) idx[2]; + v_store_aligned(idx, a); + return std::max(idx[0], idx[1]); +} + +CV_ALWAYS_INLINE uint64_t v_reduce_min(const v_uint64x2& a) +{ + uint64_t CV_DECL_ALIGNED(32) idx[2]; + v_store_aligned(idx, a); + return std::min(idx[0], idx[1]); +} + +CV_ALWAYS_INLINE v_uint64x2 v_select(const v_uint64x2& mask, const v_uint64x2& a, const v_uint64x2& b) +{ + return b ^ ((a ^ b) & mask); +} +#endif + +#define MINMAXIDX_REDUCE(suffix, suffix2, maxLimit, IR) \ +template<typename VT, typename IT, typename T> CV_ALWAYS_INLINE void \ +minMaxIdx_reduce_##suffix( VT &valMin, VT &valMax, IT &idxMin, IT &idxMax, IT &none, \ + T &minVal, T &maxVal, size_t &minIdx, size_t &maxIdx, \ + size_t delta ) \ +{ \ + if ( v_check_any(idxMin != none) ) \ + { \ + minVal = v_reduce_min(valMin); \ + minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)minVal) == valMin), \ + idxMin, v_setall_##suffix2(maxLimit))) + delta; \ + } \ + if ( v_check_any(idxMax != none) ) \ + { \ + maxVal = v_reduce_max(valMax); \ + maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)maxVal) == valMax), \ + idxMax, v_setall_##suffix2(maxLimit))) + delta; \ + } \ +} + +MINMAXIDX_REDUCE(u8, u8, UCHAR_MAX, uchar) +MINMAXIDX_REDUCE(s8, u8, UCHAR_MAX, uchar) +MINMAXIDX_REDUCE(u16, u16, USHRT_MAX, ushort) +MINMAXIDX_REDUCE(s16, u16, USHRT_MAX, ushort) +MINMAXIDX_REDUCE(s32, u32, UINT_MAX, uint) +MINMAXIDX_REDUCE(f32, u32, (1 << 23) - 1, float) +#if CV_SIMD128_64F +MINMAXIDX_REDUCE(f64, u64, UINT_MAX, double) +#endif + +template<typename T, typename WT> CV_ALWAYS_INLINE void +minMaxIdx_finish( const T* src, const uchar* mask, WT* minval, WT* maxval, + size_t* minidx, size_t* maxidx, WT minVal, WT maxVal, + size_t minIdx, size_t maxIdx, int len, size_t startidx, + int j ) +{ + for ( ; j < len ; j++ ) + { + if ( !mask || mask[j] ) + { + T val = src[j]; + if ( val < minVal ) + { + minVal = val; + minIdx = startidx + j; + } + if (
val > maxVal ) + { + maxVal = val; + maxIdx = startidx + j; + } + } + } + + *minidx = minIdx; + *maxidx = maxIdx; + *minval = minVal; + *maxval = maxVal; +} +#endif + static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) -{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } +{ +#if CV_SIMD128 + if ( len >= v_uint8x16::nlanes ) + { + int j, len0; + int minVal, maxVal; + size_t minIdx, maxIdx; + + minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, + (int)0, (int)UCHAR_MAX, v_uint8x16::nlanes, len, startidx, j, len0 ); + + if ( j <= len0 - v_uint8x16::nlanes ) + { + v_uint8x16 inc = v_setall_u8(v_uint8x16::nlanes); + v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1)); + v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + do + { + v_uint8x16 valMin = v_setall_u8((uchar)minVal), valMax = v_setall_u8((uchar)maxVal); + v_uint8x16 idx = idxStart, idxMin = none, idxMax = none; + + int k = j; + size_t delta = startidx + j; + + if ( !mask ) + { + for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes ) + { + v_uint8x16 data = v_load(src + k); + v_uint8x16 cmpMin = (data < valMin); + v_uint8x16 cmpMax = (data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + } + } + else + { + for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes ) + { + v_uint8x16 data = v_load(src + k); + v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8(); + v_uint8x16 cmpMin = (data < valMin) & maskVal; + v_uint8x16 cmpMax = (data > valMax) & maskVal; + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(cmpMin, data, valMin); + valMax = v_select(cmpMax, data, valMax); + idx += inc; + } + } + + j = 
k; + + minMaxIdx_reduce_u8( valMin, valMax, idxMin, idxMax, none, minVal, maxVal, + minIdx, maxIdx, delta ); + } + while ( j < len0 ); + } + + minMaxIdx_finish( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, + minIdx, maxIdx, len, startidx, j ); + } + else + { + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); + } +#else + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); +#endif +} static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) -{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } +{ +#if CV_SIMD128 + if ( len >= v_int8x16::nlanes ) + { + int j, len0; + int minVal, maxVal; + size_t minIdx, maxIdx; + + minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, + (int)SCHAR_MIN, (int)SCHAR_MAX, v_int8x16::nlanes, len, startidx, j, len0 ); + + if ( j <= len0 - v_int8x16::nlanes ) + { + v_uint8x16 inc = v_setall_u8(v_int8x16::nlanes); + v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1)); + v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + do + { + v_int8x16 valMin = v_setall_s8((schar)minVal), valMax = v_setall_s8((schar)maxVal); + v_uint8x16 idx = idxStart, idxMin = none, idxMax = none; + + int k = j; + size_t delta = startidx + j; + + if ( !mask ) + { + for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes ) + { + v_int8x16 data = v_load(src + k); + v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin); + v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + } + } + else + { + for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes ) + { + v_int8x16 data = v_load(src + k); + v_uint8x16 maskVal = v_load(mask + k) != 
v_setzero_u8(); + v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin) & maskVal; + v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax) & maskVal; + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_s8(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_s8(cmpMax), data, valMax); + idx += inc; + } + } + + j = k; + + minMaxIdx_reduce_s8( valMin, valMax, idxMin, idxMax, none, minVal, maxVal, + minIdx, maxIdx, delta ); + } + while ( j < len0 ); + } + + minMaxIdx_finish( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, + minIdx, maxIdx, len, startidx, j ); + } + else + { + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); + } +#else + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); +#endif +} static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) -{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } +{ +#if CV_SIMD128 + if ( len >= v_uint16x8::nlanes ) + { + int j, len0; + int minVal, maxVal; + size_t minIdx, maxIdx; + + minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, + (int)0, (int)USHRT_MAX, v_uint16x8::nlanes, len, startidx, j, len0 ); + + if ( j <= len0 - v_uint16x8::nlanes ) + { + v_uint16x8 inc = v_setall_u16(v_uint16x8::nlanes); + v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1)); + v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7); + + do + { + v_uint16x8 valMin = v_setall_u16((ushort)minVal), valMax = v_setall_u16((ushort)maxVal); + v_uint16x8 idx = idxStart, idxMin = none, idxMax = none; + + int k = j; + size_t delta = startidx + j; + + if ( !mask ) + { + for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes ) + { + v_uint16x8 data = v_load(src + k); + v_uint16x8 cmpMin = (data < valMin); + v_uint16x8 cmpMax = (data > valMax); + idxMin = 
v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + } + } + else + { + for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes ) + { + v_uint16x8 data = v_load(src + k); + v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 cmpMin = (data < valMin) & maskVal; + v_uint16x8 cmpMax = (data > valMax) & maskVal; + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(cmpMin, data, valMin); + valMax = v_select(cmpMax, data, valMax); + idx += inc; + } + } + + j = k; + + minMaxIdx_reduce_u16( valMin, valMax, idxMin, idxMax, none, minVal, maxVal, + minIdx, maxIdx, delta ); + } + while ( j < len0 ); + } + + minMaxIdx_finish( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, + minIdx, maxIdx, len, startidx, j ); + } + else + { + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); + } +#else + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); +#endif +} static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) -{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } +{ +#if CV_SIMD128 + if ( len >= v_int16x8::nlanes ) + { + int j, len0; + int minVal, maxVal; + size_t minIdx, maxIdx; + + minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, + (int)SHRT_MIN, (int)SHRT_MAX, v_int16x8::nlanes, len, startidx, j, len0 ); + + if ( j <= len0 - v_int16x8::nlanes ) + { + v_uint16x8 inc = v_setall_u16(v_int16x8::nlanes); + v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1)); + v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7); + + do + { + v_int16x8 valMin = v_setall_s16((short)minVal), valMax = v_setall_s16((short)maxVal); + v_uint16x8 idx = idxStart, idxMin = none, idxMax = none; + + int k = j; + 
size_t delta = startidx + j; + + if ( !mask ) + { + for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes ) + { + v_int16x8 data = v_load(src + k); + v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin); + v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + } + } + else + { + for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes ) + { + v_int16x8 data = v_load(src + k); + v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin) & maskVal; + v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax) & maskVal; + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_s16(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_s16(cmpMax), data, valMax); + idx += inc; + } + } + + j = k; + + minMaxIdx_reduce_s16( valMin, valMax, idxMin, idxMax, none, minVal, maxVal, + minIdx, maxIdx, delta ); + } + while ( j < len0 ); + } + + minMaxIdx_finish( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, + minIdx, maxIdx, len, startidx, j ); + } + else + { + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); + } +#else + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); +#endif +} static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) -{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } +{ +#if CV_SIMD128 + if ( len >= 2 * v_int32x4::nlanes ) + { + int j = 0, len0 = len & -(2 * v_int32x4::nlanes); + int minVal = *minval, maxVal = *maxval; + size_t minIdx = *minidx, maxIdx = *maxidx; + + { + v_uint32x4 inc = v_setall_u32(v_int32x4::nlanes); + v_uint32x4 none = 
v_reinterpret_as_u32(v_setall_s32(-1)); + v_uint32x4 idxStart(0, 1, 2, 3); + + do + { + v_int32x4 valMin = v_setall_s32(minVal), valMax = v_setall_s32(maxVal); + v_uint32x4 idx = idxStart, idxMin = none, idxMax = none; + + int k = j; + size_t delta = startidx + j; + + if ( !mask ) + { + for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes ) + { + v_int32x4 data = v_load(src + k); + v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin); + v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + data = v_load(src + k + v_int32x4::nlanes); + cmpMin = v_reinterpret_as_u32(data < valMin); + cmpMax = v_reinterpret_as_u32(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + } + } + else + { + for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes ) + { + v_int32x4 data = v_load(src + k); + v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_int32x4 maskVal1, maskVal2; + v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); + v_uint32x4 cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal1); + v_uint32x4 cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal1); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax); + idx += inc; + data = v_load(src + k + v_int32x4::nlanes); + cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal2); + cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal2); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_s32(cmpMin), data, 
valMin); + valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax); + idx += inc; + } + } + + j = k; + + minMaxIdx_reduce_s32( valMin, valMax, idxMin, idxMax, none, minVal, maxVal, + minIdx, maxIdx, delta ); + } + while ( j < len0 ); + } + + minMaxIdx_finish( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, + minIdx, maxIdx, len, startidx, j ); + } + else + { + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); + } +#else + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); +#endif +} static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) -{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } +{ +#if CV_SIMD128 + if ( len >= 2 * v_float32x4::nlanes ) + { + int j, len0; + float minVal, maxVal; + size_t minIdx, maxIdx; + + minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, + FLT_MIN, FLT_MAX, 2 * v_float32x4::nlanes, len, startidx, j, len0 ); + + if ( j <= len0 - 2 * v_float32x4::nlanes ) + { + v_uint32x4 inc = v_setall_u32(v_float32x4::nlanes); + v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1)); + v_uint32x4 idxStart(0, 1, 2, 3); + + do + { + v_float32x4 valMin = v_setall_f32(minVal), valMax = v_setall_f32(maxVal); + v_uint32x4 idx = idxStart, idxMin = none, idxMax = none; + + int k = j; + size_t delta = startidx + j; + + if ( !mask ) + { + for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes ) + { + v_float32x4 data = v_load(src + k); + v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin); + v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + data = v_load(src + k + v_float32x4::nlanes); + cmpMin = v_reinterpret_as_u32(data < valMin); + 
cmpMax = v_reinterpret_as_u32(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + } + } + else + { + for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes ) + { + v_float32x4 data = v_load(src + k); + v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_int32x4 maskVal1, maskVal2; + v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal1); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal1); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax); + idx += inc; + data = v_load(src + k + v_float32x4::nlanes); + cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal2); + cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal2); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax); + idx += inc; + } + } + + j = k; + + minMaxIdx_reduce_f32( valMin, valMax, idxMin, idxMax, none, minVal, maxVal, + minIdx, maxIdx, delta ); + } + while ( j < len0 ); + } + + minMaxIdx_finish( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, + minIdx, maxIdx, len, startidx, j ); + } + else + { + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); + } +#else + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); +#endif +} static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) -{ 
minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } +{ +#if CV_SIMD128_64F + if ( len >= 4 * v_float64x2::nlanes ) + { + int j, len0; + double minVal, maxVal; + size_t minIdx, maxIdx; + + minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, + DBL_MIN, DBL_MAX, 4 * v_float64x2::nlanes, len, startidx, j, len0 ); + + if ( j <= len0 - 4 * v_float64x2::nlanes ) + { + v_uint64x2 inc = v_setall_u64(v_float64x2::nlanes); + v_uint64x2 none = v_reinterpret_as_u64(v_setall_s64(-1)); + v_uint64x2 idxStart(0, 1); + + do + { + v_float64x2 valMin = v_setall_f64(minVal), valMax = v_setall_f64(maxVal); + v_uint64x2 idx = idxStart, idxMin = none, idxMax = none; + + int k = j; + size_t delta = startidx + j; + + if ( !mask ) + { + for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes ) + { + v_float64x2 data = v_load(src + k); + v_uint64x2 cmpMin = v_reinterpret_as_u64(data < valMin); + v_uint64x2 cmpMax = v_reinterpret_as_u64(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + data = v_load(src + k + v_float64x2::nlanes); + cmpMin = v_reinterpret_as_u64(data < valMin); + cmpMax = v_reinterpret_as_u64(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + data = v_load(src + k + 2 * v_float64x2::nlanes); + cmpMin = v_reinterpret_as_u64(data < valMin); + cmpMax = v_reinterpret_as_u64(data > valMax); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + data = v_load(src + k + 3 * v_float64x2::nlanes); + cmpMin = v_reinterpret_as_u64(data < valMin); + cmpMax = v_reinterpret_as_u64(data > valMax); + idxMin = v_select(cmpMin, idx, 
idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_min(data, valMin); + valMax = v_max(data, valMax); + idx += inc; + } + } + else + { + for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes ) + { + v_float64x2 data = v_load(src + k); + v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_int32x4 maskVal1, maskVal2; + v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); + v_int64x2 maskVal3, maskVal4; + v_expand(maskVal1, maskVal3, maskVal4); + v_uint64x2 cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3); + v_uint64x2 cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); + idx += inc; + data = v_load(src + k + v_float64x2::nlanes); + cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4); + cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); + idx += inc; + data = v_load(src + k + 2 * v_float64x2::nlanes); + v_expand(maskVal2, maskVal3, maskVal4); + cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3); + cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); + idx += inc; + data = v_load(src + k + 3 * v_float64x2::nlanes); + cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4); + cmpMax 
= v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4); + idxMin = v_select(cmpMin, idx, idxMin); + idxMax = v_select(cmpMax, idx, idxMax); + valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); + valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); + idx += inc; + } + } + + j = k; + + minMaxIdx_reduce_f64( valMin, valMax, idxMin, idxMax, none, minVal, maxVal, + minIdx, maxIdx, delta ); + } + while ( j < len0 ); + } + + minMaxIdx_finish( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, + minIdx, maxIdx, len, startidx, j ); + } + else + { + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx); + } +#else + minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); +#endif +} typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t); diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index d8d94fdb0d..6731091463 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -894,13 +894,18 @@ template<typename R> struct TheTest TheTest & test_reduce() { Data<R> dataA; + int sum = 0; + for (int i = 0; i < R::nlanes; ++i) + { + sum += (int)(dataA[i]); // To prevent a constant overflow with int8 + } R a = dataA; - EXPECT_EQ((LaneType)1, v_reduce_min(a)); - EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a)); - EXPECT_EQ((LaneType)((1 + R::nlanes)*R::nlanes/2), v_reduce_sum(a)); + EXPECT_EQ((LaneType)1, (LaneType)v_reduce_min(a)); + EXPECT_EQ((LaneType)(R::nlanes), (LaneType)v_reduce_max(a)); + EXPECT_EQ((int)(sum), (int)v_reduce_sum(a)); dataA[0] += R::nlanes; R an = dataA; - EXPECT_EQ((LaneType)2, v_reduce_min(an)); + EXPECT_EQ((LaneType)2, (LaneType)v_reduce_min(an)); return *this; } @@ -1588,6 +1593,7 @@ void test_hal_intrin_uint8() .test_dotprod_expand() .test_min_max() .test_absdiff() + .test_reduce() .test_reduce_sad() .test_mask() .test_popcount() @@ -1629,6 +1635,7 @@ void
test_hal_intrin_int8() .test_absdiff() .test_absdiffs() .test_abs() + .test_reduce() .test_reduce_sad() .test_mask() .test_popcount()