Merge pull request #26109 from WanliZhong:univ_intrin_operator2warpper

Replace operators with wrapper functions on universal intrinsics backends #26109

This PR aims to replace the overloaded operators (arithmetic, logic, bitwise, shifts) on the universal intrinsics backends with wrapper functions (v_add, v_eq, v_and, ...).
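
For illustration, a minimal before/after sketch of the calling convention this change standardizes (hypothetical user code, not part of the patch; assumes a CV_SIMD-enabled build):

```cpp
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Before: dst[i] = (a[i] + b[i]) * s written with overloaded operators:
//     v_store(dst, (va + vb) * vs);
// After: the same kernel with wrapper functions, which also compile on
// scalable backends (e.g. RVV) where operator overloads are not provided.
void scale_sum(const float* a, const float* b, float s, float* dst)
{
    v_float32 va = vx_load(a);
    v_float32 vb = vx_load(b);
    v_float32 vs = vx_setall_f32(s);
    v_store(dst, v_mul(v_add(va, vb), vs));
}
```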

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Wanli committed via GitHub
parent 4c81e174bf
commit c8080aa415
Changed files (changed line count in parentheses):

  1. modules/core/include/opencv2/core/hal/intrin.hpp (387)
  2. modules/core/include/opencv2/core/hal/intrin_avx.hpp (270)
  3. modules/core/include/opencv2/core/hal/intrin_avx512.hpp (249)
  4. modules/core/include/opencv2/core/hal/intrin_cpp.hpp (140)
  5. modules/core/include/opencv2/core/hal/intrin_lasx.hpp (208)
  6. modules/core/include/opencv2/core/hal/intrin_lsx.hpp (190)
  7. modules/core/include/opencv2/core/hal/intrin_msa.hpp (123)
  8. modules/core/include/opencv2/core/hal/intrin_rvv071.hpp (168)
  9. modules/core/include/opencv2/core/hal/intrin_sse.hpp (221)
  10. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (112)
  11. modules/core/include/opencv2/core/hal/intrin_wasm.hpp (191)
  12. modules/features2d/src/fast.avx2.cpp (2)
  13. modules/imgproc/src/resize.cpp (4)
  14. modules/imgproc/src/sumpixels.simd.hpp (76)

@ -717,44 +717,70 @@ namespace CV__SIMD_NAMESPACE {
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
#if !CV_SIMD_SCALABLE
// Compatibility layer
#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
template<typename T> struct VTraits {
static inline int vlanes() { return T::nlanes; }
enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
using lane_type = typename T::lane_type;
};
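
A usage sketch of the VTraits shim above (fixed-width compatibility path; the helper, src, dst and n are hypothetical):

```cpp
// Query lane counts through VTraits instead of the legacy _Tpvec::nlanes enum;
// on scalable backends only vlanes() is meaningful, so new code should prefer it.
static void copy_simd(const float* src, float* dst, int n)
{
    const int step = VTraits<v_float32>::vlanes();   // lanes per vector
    int i = 0;
    for (; i + step <= n; i += step)
        v_store(dst + i, vx_load(src + i));
    for (; i < n; ++i)                               // scalar tail
        dst[i] = src[i];
}
```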
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
{ \
return a + b; \
} \
inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
return a - b; \
} \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(f1 + f2, vf...); \
return v.get0(); \
}
#define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
{ \
return a >> n; \
} \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
{ \
return a << n; \
OPENCV_HAL_WRAP_GRT0(v_uint8)
OPENCV_HAL_WRAP_GRT0(v_int8)
OPENCV_HAL_WRAP_GRT0(v_uint16)
OPENCV_HAL_WRAP_GRT0(v_int16)
OPENCV_HAL_WRAP_GRT0(v_uint32)
OPENCV_HAL_WRAP_GRT0(v_int32)
OPENCV_HAL_WRAP_GRT0(v_uint64)
OPENCV_HAL_WRAP_GRT0(v_int64)
OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_add(v_add(f1, f2), f3, vf...); \
}
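
A usage sketch of the variadic overload defined above (illustrative; the helper and v0..v3 are hypothetical, already-loaded vectors):

```cpp
static inline float sum4(const v_float32& v0, const v_float32& v1,
                         const v_float32& v2, const v_float32& v3)
{
    v_float32 acc = v_add(v0, v1, v2, v3);   // folds to v_add(v_add(v_add(v0, v1), v2), v3)
    return v_reduce_sum(acc);                // horizontal sum of the accumulator
}
```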
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
@ -778,12 +804,6 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
@ -799,110 +819,15 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
{ \
return a & b; \
} \
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
{ \
return a | b; \
} \
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
{ \
return a ^ b; \
}
#define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
return ~a; \
}
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
OPENCV_HAL_WRAP_NOT_OP(v_uint8)
OPENCV_HAL_WRAP_NOT_OP(v_uint16)
OPENCV_HAL_WRAP_NOT_OP(v_uint32)
OPENCV_HAL_WRAP_NOT_OP(v_uint64)
OPENCV_HAL_WRAP_NOT_OP(v_int8)
OPENCV_HAL_WRAP_NOT_OP(v_int16)
OPENCV_HAL_WRAP_NOT_OP(v_int32)
OPENCV_HAL_WRAP_NOT_OP(v_int64)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
return a * b; \
} \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(f1 * f2, vf...); \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_mul(v_mul(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
@ -939,140 +864,6 @@ namespace CV__SIMD_NAMESPACE {
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
{ \
return a / b; \
}
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
return a op b; \
}
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
#endif
#endif
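
A hedged usage sketch of the comparison wrappers above: each returns a per-lane all-ones/all-zeros mask, typically consumed by v_select or the bitwise wrappers (the clamp helper below is hypothetical):

```cpp
// Lanes of v larger than thr are replaced by thr, without branches.
static inline v_float32 clamp_upper(const v_float32& v, const v_float32& thr)
{
    v_float32 mask = v_gt(v, thr);        // all-ones lanes where v > thr, all-zeros elsewhere
    return v_select(mask, thr, v);        // per-lane: mask ? thr : v
}
```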
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
return v.get0(); \
}
OPENCV_HAL_WRAP_GRT0(v_uint8)
OPENCV_HAL_WRAP_GRT0(v_int8)
OPENCV_HAL_WRAP_GRT0(v_uint16)
OPENCV_HAL_WRAP_GRT0(v_int16)
OPENCV_HAL_WRAP_GRT0(v_uint32)
OPENCV_HAL_WRAP_GRT0(v_int32)
OPENCV_HAL_WRAP_GRT0(v_uint64)
OPENCV_HAL_WRAP_GRT0(v_int64)
OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
@ -1142,74 +933,6 @@ namespace CV__SIMD_NAMESPACE {
#endif //!CV_SIMD_SCALABLE
#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
// Compatibility layer for the backend that cleaned up.
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(v_add(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(v_mul(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#endif //CV_NEON
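
As a usage sketch of the helpers defined in this compatibility layer (the wrapper and ptr are hypothetical; ptr must hold at least one full vector):

```cpp
// Read the last lane and splat it across the vector.
static inline v_float32 splat_last(const float* ptr, float& last)
{
    v_float32 v = vx_load(ptr);
    last = v_extract_highest(v);        // == v_extract_n<VTraits<v_float32>::nlanes - 1>(v)
    return v_broadcast_highest(v);      // last lane copied to every lane
}
```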
//! @cond IGNORED
// backward compatibility

@ -673,53 +673,51 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd)
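
For orientation, roughly what one instantiation of the rewritten OPENCV_HAL_IMPL_AVX_BIN_OP macro now produces (paraphrased expansion, not literal preprocessor output):

```cpp
// OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32) yields approximately:
inline v_uint32x8 v_add(const v_uint32x8& a, const v_uint32x8& b)
{ return v_uint32x8(_mm256_add_epi32(a.val, b.val)); }
// The operator+ / operator+= overloads previously generated here are no longer emitted.
```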
// saturating multiply 8-bit, 16-bit
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epu16(a.val, b.val);
@ -727,7 +725,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epi16(a.val, b.val);
@ -735,14 +733,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
@ -833,13 +823,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@ -867,11 +857,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
{ return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
@ -900,29 +890,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
/** Comparison **/
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m256i smask = _mm256_set1_##suffix(sbit); \
return _Tpuvec(_mm256_cmpgt_##suffix( \
_mm256_xor_si256(a.val, smask), \
_mm256_xor_si256(b.val, smask))); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
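
The unsigned v_gt above relies on the sign-bit-flip trick; a scalar analogy, offered as an assumption-labelled sketch rather than code from the patch:

```cpp
#include <cstdint>

// XOR-ing both operands with the sign bit maps unsigned order onto signed order,
// so a signed compare of the flipped values answers the unsigned question.
static inline bool unsigned_gt32(uint32_t a, uint32_t b)
{
    return (int32_t)(a ^ 0x80000000u) > (int32_t)(b ^ 0x80000000u);
}
```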
@ -932,25 +922,25 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
@ -1216,9 +1206,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
inline float v_reduce_sum(const v_float32x8& a)
{
@ -1273,27 +1263,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
v_uint32x8 l, h;
v_expand(v_add_wrap(a - b, b - a), l, h);
return v_reduce_sum(l + h);
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
v_uint32x8 l, h;
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
return v_reduce_sum(l + h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
return v_reduce_sum(v_max(a, b) - v_min(a, b));
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 m = a < b;
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
v_int32x8 m = v_lt(a, b);
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))));
}
/** Popcount **/
@ -1308,15 +1298,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a)
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff));
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff));
}
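
A scalar model of the per-lane reduction in v_popcount above (illustrative analogy: the vector code uses whole-vector byte rotations, the model uses shifts within one 32-bit value):

```cpp
#include <cstdint>

// byte_counts packs four per-byte popcounts (each <= 8) into one 32-bit lane.
static inline uint32_t fold_byte_popcounts(uint32_t byte_counts)
{
    uint32_t p = byte_counts;
    p += p >> 8;              // analogue of p = v_add(p, v_rotate_right<1>(p))
    p += p >> 16;             // analogue of p = v_add(p, v_rotate_right<2>(p))
    return p & 0x000000ffu;   // analogue of the 0x000000ff mask: total bit count
}
```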
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
@ -1408,9 +1398,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, b * b); } \
{ return v_fma(a, a, v_mul(b, b)); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, b*b)); }
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
@ -1419,7 +1409,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
return a * b + c;
return v_add(v_mul(a, b), c);
}
inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
@ -1429,16 +1419,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x
inline v_float32x8 v_invsqrt(const v_float32x8& x)
{
v_float32x8 half = x * v256_setall_f32(0.5);
v_float32x8 half = v_mul(x, v256_setall_f32(0.5));
v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
// todo: _mm256_fnmsub_ps
t *= v256_setall_f32(1.5) - ((t * t) * half);
t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half)));
return t;
}
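
The update of t above is one Newton-Raphson refinement of the hardware rsqrt estimate; a scalar sketch of the same step (assumption, for clarity only):

```cpp
// One Newton step for y = 1/sqrt(x): y_next = y * (1.5 - 0.5 * x * y * y)
static inline float refine_rsqrt(float x, float t)
{
    float half = x * 0.5f;
    return t * (1.5f - (t * t) * half);
}
```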
inline v_float64x4 v_invsqrt(const v_float64x4& x)
{
return v256_setall_f64(1.) / v_sqrt(x);
return v_div(v256_setall_f64(1.), v_sqrt(x));
}
/** Absolute values **/
@ -1451,23 +1441,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
inline v_float32x8 v_abs(const v_float32x8& x)
{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); }
inline v_float64x4 v_abs(const v_float64x4& x)
{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); }
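
v_abs above clears the IEEE-754 sign bit with a constant mask; a scalar bit-level sketch of the same idea (assumption, not part of the patch):

```cpp
#include <cstdint>
#include <cstring>

static inline float abs_by_mask(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u &= 0x7fffffffu;                 // clear the sign bit, keep exponent and mantissa
    std::memcpy(&x, &u, sizeof(u));
    return x;
}
```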
/** Absolute difference **/
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub_wrap(a, b);
v_int8x32 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
v_int8x32 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
}
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
@ -1475,26 +1465,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 d = a - b;
v_int32x8 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
v_int32x8 d = v_sub(a, b);
v_int32x8 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
}
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = a - b;
v_int8x32 m = a < b;
return (d ^ m) - m;
v_int8x32 d = v_sub(a, b);
v_int8x32 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
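
The signed absolute-difference code above uses the (d ^ m) - m conditional-negation idiom; a scalar analogy (assumption):

```cpp
#include <cstdint>

static inline int32_t absdiff_s32(int32_t a, int32_t b)
{
    int32_t d = a - b;
    int32_t m = -(int32_t)(a < b);   // 0 when a >= b, -1 (all ones) when a < b
    return (d ^ m) - m;              // d unchanged when m == 0, negated when m == -1
}
```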
////////// Conversions /////////
@ -1789,7 +1779,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@ -1799,7 +1789,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
return v_int64x4(_mm256_add_epi64(even, odd));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
@ -1816,7 +1806,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
return v_uint32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
@ -1831,7 +1821,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
return v_int32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@ -1855,7 +1845,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
@ -1871,13 +1861,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//////// Fast Dot Product ////////
@ -1923,7 +1913,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
@ -1934,7 +1924,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
return v_int64x4(_mm256_add_epi64(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@ -1953,7 +1943,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
}
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
@ -2058,43 +2048,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
}
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
}
template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, (a + delta) >> n);
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
}
template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
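
In the v_rshr_pack family above, adding delta = 1 << (n-1) before the shift gives round-to-nearest rather than truncation; a scalar sketch for one lane (assumption; the saturation done by the final pack is omitted):

```cpp
template <int n>
static inline int rounded_narrowing_shift(int a)
{
    const int delta = 1 << (n - 1);    // half of the final quantization step
    return (a + delta) >> n;           // round-half-up instead of plain truncation
}
```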
// 32
@ -2127,43 +2117,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
// we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
}
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
}
template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_u_store(ptr, (a + delta) >> n);
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
}
template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
// 64
@ -2192,28 +2182,28 @@ template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
// pack boolean

@ -663,58 +663,56 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
}
#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64)
/** Saturating arithmetics **/
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd)
// saturating multiply
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b)
{
v_uint16x32 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b)
{
v_int16x32 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b)
{
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
__m512i ph = _mm512_mulhi_epu16(a.val, b.val);
@ -724,7 +722,7 @@ inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
const __m512i m = _mm512_set1_epi32(65535);
return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
}
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b)
{
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
__m512i ph = _mm512_mulhi_epi16(a.val, b.val);
@ -733,15 +731,6 @@ inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
return v_int16x32(_mm512_packs_epi32(p0, p1));
}
inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
{ a = a * b; return a; }
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
{ a = a * b; return a; }
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
{ a = a * b; return a; }
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
{ a = a * b; return a; }
inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
@ -802,13 +791,13 @@ inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@ -830,10 +819,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
{ return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
@ -865,16 +854,16 @@ OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
/** Comparison **/
#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
@ -886,16 +875,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
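
For orientation, roughly what one instantiation of the AVX-512 float compare wrapper expands to (paraphrased expansion):

```cpp
// OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, v_float32x16, ps, epi32, (int)-1) yields approximately:
inline v_float32x16 v_lt(const v_float32x16& a, const v_float32x16& b)
{
    __mmask16 k = _mm512_cmp_ps_mask(a.val, b.val, _CMP_LT_OQ);   // per-lane predicate bits
    return v_float32x16(_mm512_castsi512_ps(
        _mm512_maskz_set1_epi32(k, (int)-1)));                    // all-ones lanes where the bit is set
}
```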
@ -1250,9 +1239,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
inline int v_reduce_sum(const v_int16x32& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
inline uint v_reduce_sum(const v_uint16x32& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
inline sctype v_reduce_##func(const _Tpvec& a) \
@ -1306,17 +1295,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
{ return v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); }
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); }
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); }
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); }
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
{ return v_reduce_sum(v_and(v_sub(a, b), v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); }
/** Popcount **/
inline v_uint8x64 v_popcount(const v_int8x64& a)
@ -1351,8 +1340,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a)
_mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
#else
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff));
#endif
}
inline v_uint32x16 v_popcount(const v_int32x16& a)
@ -1361,9 +1350,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a)
return v_uint32x16(_mm512_popcnt_epi32(a.val));
#else
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff));
#endif
}
inline v_uint64x8 v_popcount(const v_int64x8& a)
@ -1403,9 +1392,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, b * b); } \
{ return v_fma(a, a, v_mul(b, b)); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, b * b)); }
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
@ -1413,7 +1402,7 @@ OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return a * b + c; }
{ return v_add(v_mul(a, b), c); }
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_fma(a, b, c); }
@ -1422,9 +1411,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x)
#if CV_AVX_512ER
return v_float32x16(_mm512_rsqrt28_ps(x.val));
#else
v_float32x16 half = x * v512_setall_f32(0.5);
v_float32x16 half = v_mul(x, v512_setall_f32(0.5));
v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
t *= v512_setall_f32(1.5) - ((t * t) * half);
t = v_mul(t, v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half)));
return t;
#endif
}
@ -1434,7 +1423,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x)
#if CV_AVX_512ER
return v_float64x8(_mm512_rsqrt28_pd(x.val));
#else
return v512_setall_f64(1.) / v_sqrt(x);
return v_div(v512_setall_f64(1.), v_sqrt(x));
// v_float64x8 half = x * v512_setall_f64(0.5);
// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
// t *= v512_setall_f64(1.5) - ((t * t) * half);
@ -1482,17 +1471,17 @@ inline v_float64x8 v_abs(const v_float64x8& x)
/** Absolute difference **/
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
{
v_int8x64 d = v_sub_wrap(a, b);
v_int8x64 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
v_int8x64 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
}
inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
@ -1500,26 +1489,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 d = a - b;
v_int32x16 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
v_int32x16 d = v_sub(a, b);
v_int32x16 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
}
inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
/** Saturating absolute difference **/
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
{
v_int8x64 d = a - b;
v_int8x64 m = a < b;
return (d ^ m) - m;
v_int8x64 d = v_sub(a, b);
v_int8x64 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
}
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
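// The (d ^ m) - m pattern used above is a branchless conditional negate: the
// mask m = v_lt(a, b) is all ones or all zeros per lane, so
//   m == -1:  (d ^ -1) - (-1) == ~d + 1 == -d
//   m ==  0:  (d ^  0) -   0  ==   d
// turning the wrapped difference d = a - b into |a - b| lane by lane.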
////////// Conversions /////////
@ -1818,7 +1807,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
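// _mm512_madd_epi16 multiplies adjacent pairs of 16-bit lanes and sums each
// pair into a 32-bit lane, so the three-argument form is that pairwise dot
// product with the accumulator c folded in via v_add.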
// 32 >> 64
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
@ -1828,7 +1817,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
// 8 >> 32
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
@ -1844,7 +1833,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
return v_uint32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
{
@ -1859,7 +1848,7 @@ inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
return v_int32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 16 >> 64
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
@ -1883,7 +1872,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
));
}
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
{
@ -1893,13 +1882,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//////// Fast Dot Product ////////
@ -1944,7 +1933,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32&
return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
}
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod_expand(a, b); }
@ -1955,7 +1944,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b,
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
@ -1969,7 +1958,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v,
v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
}
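// v_matmul treats the 512-bit register as packed 4-element vectors; each block
// computes m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], now chaining v_fma with a
// final v_mul instead of operator*.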
inline v_float32x16 v_matmuladd(const v_float32x16& v,
@ -2070,43 +2059,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
}
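// The delta = 1 << (n-1) bias turns the truncating shift into round-to-nearest.
// Scalar model of the rounding used by all v_rshr_pack* variants in this block
// (illustrative only, not part of the API):
//   int rshr(int a, int n) { return (a + (1 << (n - 1))) >> n; }
//   rshr(100, 4) == 6   // 100/16 == 6.25
//   rshr(105, 4) == 7   // 105/16 == 6.5625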
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
{
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
}
template<int n> inline
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, (a + delta) >> n);
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
}
template<int n> inline
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
// 32
@ -2139,43 +2128,43 @@ template<int n> inline
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
{
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
}
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
{
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
}
template<int n> inline
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
v_pack_u_store(ptr, (a + delta) >> n);
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
}
template<int n> inline
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
// 64
@ -2196,28 +2185,28 @@ template<int n> inline
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
{
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
{
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
template<int n> inline
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
{
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
return v_pack((a + delta) >> n, (b + delta) >> n);
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
}
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
{
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, (a + delta) >> n);
v_pack_store(ptr, v_shr(v_add(a, delta), n));
}
// pack boolean

@ -225,32 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto
Element-wise binary and unary operations.
- Arithmetics:
@ref operator +(const v_reg &a, const v_reg &b) "+",
@ref operator -(const v_reg &a, const v_reg &b) "-",
@ref operator *(const v_reg &a, const v_reg &b) "*",
@ref operator /(const v_reg &a, const v_reg &b) "/",
@ref v_add(const v_reg &a, const v_reg &b) "+",
@ref v_sub(const v_reg &a, const v_reg &b) "-",
@ref v_mul(const v_reg &a, const v_reg &b) "*",
@ref v_div(const v_reg &a, const v_reg &b) "/",
@ref v_mul_expand
- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
- Bitwise shifts:
@ref operator <<(const v_reg &a, int s) "<<",
@ref operator >>(const v_reg &a, int s) ">>",
@ref v_shl(const v_reg &a, int s) "<<",
@ref v_shr(const v_reg &a, int s) ">>",
@ref v_shl, @ref v_shr
- Bitwise logic:
@ref operator &(const v_reg &a, const v_reg &b) "&",
@ref operator |(const v_reg &a, const v_reg &b) "|",
@ref operator ^(const v_reg &a, const v_reg &b) "^",
@ref operator ~(const v_reg &a) "~"
@ref v_and(const v_reg &a, const v_reg &b) "&",
@ref v_or(const v_reg &a, const v_reg &b) "|",
@ref v_xor(const v_reg &a, const v_reg &b) "^",
@ref v_not(const v_reg &a) "~"
- Comparison:
@ref operator >(const v_reg &a, const v_reg &b) ">",
@ref operator >=(const v_reg &a, const v_reg &b) ">=",
@ref operator <(const v_reg &a, const v_reg &b) "<",
@ref operator <=(const v_reg &a, const v_reg &b) "<=",
@ref operator ==(const v_reg &a, const v_reg &b) "==",
@ref operator !=(const v_reg &a, const v_reg &b) "!="
@ref v_gt(const v_reg &a, const v_reg &b) ">",
@ref v_ge(const v_reg &a, const v_reg &b) ">=",
@ref v_lt(const v_reg &a, const v_reg &b) "<",
@ref v_le(const v_reg &a, const v_reg &b) "<=",
@ref v_eq(const v_reg &a, const v_reg &b) "==",
@ref v_ne(const v_reg &a, const v_reg &b) "!="
- min/max: @ref v_min, @ref v_max
@ -573,50 +573,43 @@ enum {
/** @brief Add values
For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Subtract values
For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Multiply values
For 16- and 32-bit integer types and floating types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Divide values
For floating types only. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise AND
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise XOR
Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise NOT
Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a);
#ifndef CV_DOXYGEN
@ -639,33 +632,26 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \
template<int n> inline \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
for( int i = 0; i < n; i++ ) \
a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func)
CV__HAL_INTRIN_IMPL_BIN_OP(+)
CV__HAL_INTRIN_IMPL_BIN_OP(-)
CV__HAL_INTRIN_IMPL_BIN_OP(*)
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add)
CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub)
CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul)
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div)
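// The generated wrappers replace the operator spellings at call sites, e.g.
// (illustrative snippet; a, b, c are arbitrary v_float32 values):
//   v_float32 r = (a + b) * c;            // old operator form
//   v_float32 r = v_mul(v_add(a, b), c);  // new wrapper form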
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
@ -673,29 +659,20 @@ v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return c; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
for( int i = 0; i < n; i++ ) \
a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */
CV__HAL_INTRIN_IMPL_BIT_OP(&)
CV__HAL_INTRIN_IMPL_BIT_OP(|)
CV__HAL_INTRIN_IMPL_BIT_OP(^)
CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and)
CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or)
CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor)
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
@ -703,7 +680,7 @@ v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
return c; \
} \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not)
#endif // !CV_DOXYGEN
@ -760,7 +737,6 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
* @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
#define OPENCV_HAL_MATH_HAVE_LOG 1
/**
* @brief Error function.
@ -771,9 +747,7 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
#define OPENCV_HAL_MATH_HAVE_SIN 1
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
#define OPENCV_HAL_MATH_HAVE_COS 1
//! @endcond
/** @brief Absolute value of elements
@ -897,9 +871,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
v_reg<_Tp, n> c; \
@ -911,28 +885,28 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
/** @brief Less-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<)
OPENCV_HAL_IMPL_CMP_OP(<, v_lt)
/** @brief Greater-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>)
OPENCV_HAL_IMPL_CMP_OP(>, v_gt)
/** @brief Less-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<=)
OPENCV_HAL_IMPL_CMP_OP(<=, v_le)
/** @brief Greater-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>=)
OPENCV_HAL_IMPL_CMP_OP(>=, v_ge)
/** @brief Equal comparison */
OPENCV_HAL_IMPL_CMP_OP(==)
OPENCV_HAL_IMPL_CMP_OP(==, v_eq)
/** @brief Not equal comparison */
OPENCV_HAL_IMPL_CMP_OP(!=)
OPENCV_HAL_IMPL_CMP_OP(!=, v_ne)
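// The comparison wrappers still return per-lane all-ones/all-zeros masks, so
// they combine with v_select as before (illustrative only):
//   v_int32 m = v_gt(a, b);
//   v_int32 r = v_select(m, a, b);  // a where a > b, otherwise b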
template<int n>
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
@ -1301,8 +1275,8 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
@ -1313,12 +1287,12 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
/** @brief Bitwise shift left
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<< )
OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl)
/** @brief Bitwise shift right
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>> )
OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr)
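// v_shl/v_shr with an int argument handle run-time shift amounts; the template
// overloads defined further below keep the compile-time form, e.g.
// (illustrative): v_shl(x, 3) vs. v_shl<3>(x).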
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
@ -2942,7 +2916,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
{ return a << shift; }
{ return v_shl(a, shift); }
//! @name Left shift
//! @{
@ -2959,7 +2933,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64)
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
{ return a >> shift; }
{ return v_shr(a, shift); }
//! @name Right shift
//! @{
@ -3285,7 +3259,7 @@ inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
const v_reg<double, n/2>& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }

@ -746,53 +746,51 @@ OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d)
// saturating multiply 8-bit, 16-bit
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = __lasx_xvmul_h(a.val, b.val);
__m256i ph = __lasx_xvmuh_hu(a.val, b.val);
@ -800,7 +798,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
__m256i p1 = __lasx_xvilvh_h(ph, pl);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = __lasx_xvmul_h(a.val, b.val);
__m256i ph = __lasx_xvmuh_h(a.val, b.val);
@ -808,14 +806,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
__m256i p1 = __lasx_xvilvh_h(ph, pl);
return v_int16x16(_lasx_packs_w(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
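// 8- and 16-bit lanes have no full-width product, so v_mul widens the inputs
// (v_mul_expand or the even/odd multiplies above) and packs back with
// saturation -- per lane this is saturate_cast of the widened product.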
/** Non-saturating arithmetics **/
@ -904,13 +894,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
{ return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@ -932,10 +922,10 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
{ return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
@ -948,16 +938,14 @@ OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }
#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \
inline _Tpvec v_not(const _Tpvec& a) \
{ return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
@ -983,25 +971,25 @@ inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const
/** Comparison **/
#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; }
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); }
#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
{ \
return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
@ -1011,37 +999,37 @@ OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
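// Float > and >= are expressed through the swapped-operand clt/cle compares,
// mirroring the former operator> / operator>= definitions.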
inline v_float32x8 v_not_nan(const v_float32x8& a)
@ -1309,9 +1297,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
inline float v_reduce_sum(const v_float32x8& a)
{
@ -1379,27 +1367,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
v_uint32x8 l, h;
v_expand(v_add_wrap(a - b, b - a), l, h);
return v_reduce_sum(l + h);
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
v_uint32x8 l, h;
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
return v_reduce_sum(l + h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
return v_reduce_sum(v_max(a, b) - v_min(a, b));
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 m = a < b;
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
v_int32x8 m = v_lt(a, b);
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
v_float32x8 a_b = a - b;
v_float32x8 a_b = v_sub(a, b);
return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
}
@ -1503,9 +1491,9 @@ OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, b * b); } \
{ return v_fma(a, a, v_mul(b, b)); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, b*b)); }
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
@ -1556,20 +1544,20 @@ inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = a - b;
v_int8x32 m = a < b;
return (d ^ m) - m;
v_int8x32 d = v_sub(a, b);
v_int8x32 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
////////// Conversions /////////
@ -1891,7 +1879,7 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@ -1915,7 +1903,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
@ -1926,7 +1914,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
return v_int32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@ -1938,7 +1926,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
@ -1950,13 +1938,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//////// Fast Dot Product ////////
@ -1993,7 +1981,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
@ -2004,7 +1992,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
return v_int64x4(__lasx_xvadd_d(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@ -2024,7 +2012,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
}
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,

@ -525,53 +525,51 @@ OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d)
// saturating multiply 8-bit, 16-bit
inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b)
{
v_uint16x8 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b)
{
v_int16x8 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
{
__m128i a0 = a.val, b0 = b.val;
__m128i pev = __lsx_vmulwev_w_hu(a0, b0);
@ -580,7 +578,7 @@ inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
__m128i ph = __lsx_vilvh_w(pod, pev);
return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
}
inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
{
__m128i a0 = a.val, b0 = b.val;
__m128i pev = __lsx_vmulwev_w_h(a0, b0);
@ -589,14 +587,6 @@ inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
__m128i ph = __lsx_vilvh_w(pod, pev);
return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
}
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
{ a = a * b; return a; }
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
{ a = a * b; return a; }
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
{ a = a * b; return a; }
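// As in the LASX backend, the 16-bit v_mul above widens the products and
// narrows them back with the saturating narrow intrinsics
// (__lsx_vssrlrni_hu_w / __lsx_vssrarni_h_w), giving the saturated per-lane product.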
/** Non-saturating arithmetics **/
@ -681,13 +671,13 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
{ return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@ -708,10 +698,10 @@ OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
inline _Tpvec operator ~(const _Tpvec& a) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
{ return _Tpvec(__lsx_vnori_b(a.val, 0)); } \
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
@ -724,18 +714,14 @@ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
a.val = cast(c); \
return a;}
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }
#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \
inline _Tpvec v_not(const _Tpvec& a) \
{ return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
@ -760,23 +746,23 @@ inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const
/** Comparison **/
#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~( a == b ); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a ; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); } \
#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
@ -786,37 +772,37 @@ OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)
#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \
#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
inline v_float32x4 v_not_nan(const v_float32x4& a)
@ -1188,7 +1174,7 @@ inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 a_b = a - b;
v_float32x4 a_b = v_sub(a, b);
return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
}
@ -1295,9 +1281,9 @@ OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, b * b); } \
{ return v_fma(a, a, v_mul(b, b)); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, b * b)); }
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
@ -1349,20 +1335,20 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
v_int8x16 d = v_sub(a, b);
v_int8x16 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
///////// Conversions /////////
@ -1673,7 +1659,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c ;}
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
@ -1685,7 +1671,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(__lsx_vadd_w(prod0, prod1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@ -1698,7 +1684,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
return v_uint64x2(__lsx_vadd_d(prod0, prod1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@ -1710,13 +1696,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(__lsx_vadd_d(prod0, prod1));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
///////// Fast Dot Product //////
@ -1755,7 +1741,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
@ -1767,7 +1753,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(__lsx_vadd_d(lo, hi));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)

@ -345,53 +345,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64)
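For reference, each instantiation of the reworked macro now produces only the named wrapper and no compound-assignment overload; OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8) expands to roughly:

inline v_uint8x16 v_add(const v_uint8x16& a, const v_uint8x16& b)
{
    return v_uint8x16(msa_qaddq_u8(a.val, b.val)); // saturating 8-bit add
}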
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{a = a * b; return a; }
}
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
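The MUL_SAT pattern widens, multiplies, then packs back with saturation. A scalar sketch of one int8 lane, in plain C++:

#include <algorithm>
#include <cstdint>

// Saturating 8-bit multiply: widen, multiply, clamp — the scalar analogue of
// v_mul_expand followed by v_pack in the macro above.
inline int8_t mul_sat_i8(int8_t a, int8_t b)
{
    int wide = (int)a * (int)b;
    return (int8_t)std::min(127, std::max(-128, wide)); // mul_sat_i8(100, 2) == 127
}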
@ -546,13 +539,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(msa_hadd_s64(prod, prod));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//////// Fast Dot Product ////////
@ -596,10 +589,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
}
@ -614,21 +607,16 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
return a; \
}
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32)
inline v_float32x4 operator ~ (const v_float32x4& a)
inline v_float32x4 v_not(const v_float32x4& a)
{
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
@ -659,21 +647,16 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
{ \
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
return a; \
}
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64)
inline v_float64x2 operator ~ (const v_float64x2& a)
inline v_float64x2 v_not(const v_float64x2& a)
{
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
@ -704,17 +687,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
@ -821,9 +804,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
// trade efficiency for convenience
#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \

@ -373,70 +373,50 @@ inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v,
#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val, num)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val, num); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b)
{
a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
return a;
return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
}
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b)
{
return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
}
inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
{
a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
return a;
}
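With the compound-assignment operators removed across the backends, callers update in place: `a += b` becomes `a = v_add(a, b)`. A minimal sketch using only wrappers introduced by this patch (assumes opencv2/core/hal/intrin.hpp is included and the cv namespace is visible):

// was: v = v * scale + bias; v += delta;
inline cv::v_float32x4 scale_bias_offset(const cv::v_float32x4& v)
{
    using namespace cv;
    v_float32x4 scale = v_setall_f32(0.5f), bias = v_setall_f32(1.0f), delta = v_setall_f32(2.0f);
    v_float32x4 r = v_add(v_mul(v, scale), bias); // v * scale + bias
    r = v_add(r, delta);                          // r += delta
    return r;
}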
// TODO: exp, log, sin, cos
#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
@ -562,10 +542,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
}
#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec operator ~ (const _Tpvec & a) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec v_not(const _Tpvec & a) \
{ \
return _Tpvec(vnot_v_##suffix(a.val, num)); \
}
@ -580,41 +560,31 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1)
inline v_float32x4 operator ~ (const v_float32x4& a)
inline v_float32x4 v_not(const v_float32x4& a)
{
return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
}
#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
{ \
return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1)
inline v_float64x2 operator ~ (const v_float64x2& a)
inline v_float64x2 v_not(const v_float64x2& a)
{
return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
}
@ -1174,32 +1144,32 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
@ -1215,37 +1185,37 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
//TODO: ==
inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
@ -1259,37 +1229,37 @@ inline v_float32x4 v_not_nan(const v_float32x4& a)
}
//TODO: ==
inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
@ -1331,13 +1301,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
@ -2037,13 +2007,11 @@ OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
auto res = mul(a.val, b.val, num); \
return _Tpvec(cvt(res, 0, num)); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
}
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
@ -2845,7 +2813,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{
vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
@ -2854,7 +2822,7 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
}
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ v_float64x2 res = v_dotprod_expand_fast(a, b);
return res + c; }
return v_add(res, c); }
#endif
////// FP16 support ///////
#if __riscv_v == 7000

@ -735,53 +735,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
}
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64)
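Note that the 8- and 16-bit instantiations keep the saturating intrinsics (_mm_adds_epu8 and friends) while the 32/64-bit ones wrap around. A scalar sketch of the difference for unsigned 8-bit lanes, plain C++:

#include <algorithm>
#include <cstdint>

inline uint8_t add_sat_u8(uint8_t a, uint8_t b)  // per-lane behaviour of _mm_adds_epu8
{ return (uint8_t)std::min(255, (int)a + (int)b); }

inline uint8_t add_wrap_u8(uint8_t a, uint8_t b) // plain modular addition
{ return (uint8_t)(a + b); }
// add_sat_u8(200, 100) == 255, add_wrap_u8(200, 100) == 44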
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
}
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
@ -845,7 +838,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
@ -872,7 +865,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
#endif
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
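v_dotprod on 16-bit lanes maps to _mm_madd_epi16: each adjacent pair of 16-bit products is summed into a single 32-bit lane, and the three-argument overload simply adds the accumulator c. A scalar sketch of one output lane:

#include <cstdint>

// One 32-bit lane of v_dotprod(v_int16x8, v_int16x8), per _mm_madd_epi16 semantics.
inline int32_t dotprod_lane(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
    return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}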
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
@ -886,7 +879,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
@ -899,7 +892,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(_mm_add_epi32(p0, p1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@ -911,14 +904,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 += c1; d0 += d1;
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
return v_uint64x2(_mm_add_epi64(
_mm_unpacklo_epi64(c0.val, d0.val),
_mm_unpackhi_epi64(c0.val, d0.val)
));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@ -931,7 +924,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
@ -939,8 +932,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
#if CV_SSE4_1
return v_cvt_f64(v_dotprod(a, b));
#else
v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b));
v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b));
return v_float64x2(_mm_add_pd(
_mm_unpacklo_pd(c.val, d.val),
@ -949,7 +942,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
#endif
}
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//////// Fast Dot Product ////////
@ -957,13 +950,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod_fast(a, b) + c; }
{ return v_add(v_dotprod_fast(a, b), c); }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
@ -977,7 +970,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
@ -994,7 +987,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
@ -1006,34 +999,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 += c1; d0 += d1;
return c0 + d0;
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
return v_add(c0, d0);
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return c + d;
return v_add(c, d);
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 32 >> 64f
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
}
@ -1182,58 +1175,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
}
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
__m128i not_mask = _mm_set1_epi32(-1); \
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
__m128i not_mask = _mm_set1_epi32(-1); \
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \
{ \
return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
{ \
return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
@ -1244,17 +1237,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
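SSE2 has no unsigned integer compare, so the macro biases both operands by the sign bit (the smask XOR) and falls back to the signed compare. A scalar sketch of why the bias preserves ordering, assuming two's-complement int32:

#include <cstdint>

inline bool ugt_via_signed(uint32_t a, uint32_t b)
{
    int32_t sa = (int32_t)(a ^ 0x80000000u); // maps unsigned order onto signed order
    int32_t sb = (int32_t)(b ^ 0x80000000u);
    return sa > sb;                          // equals (a > b) on the original values
}
// ugt_via_signed(0xFFFFFFFFu, 1u) == true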
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
@ -1262,17 +1255,17 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
#else
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
#endif
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
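Without SSE4.1's _mm_cmpeq_epi64, the fallback compares the 32-bit halves and ANDs each half's mask with its swapped neighbour, so the lane is all-ones only when both halves match. A scalar sketch of that idea:

#include <cstdint>

inline uint64_t eq64_from_eq32(uint64_t a, uint64_t b)
{
    uint32_t lo = ((uint32_t)a == (uint32_t)b) ? 0xFFFFFFFFu : 0u;
    uint32_t hi = ((uint32_t)(a >> 32) == (uint32_t)(b >> 32)) ? 0xFFFFFFFFu : 0u;
    uint32_t m  = lo & hi;          // all-ones only if both halves are equal
    return ((uint64_t)m << 32) | m; // broadcast the mask to the whole 64-bit lane
}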
@ -1311,17 +1304,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
/** Absolute difference **/
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub_wrap(a, b);
v_int8x16 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
v_int8x16 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
}
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
@ -1329,25 +1322,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
v_int32x4 d = a - b;
v_int32x4 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
v_int32x4 d = v_sub(a, b);
v_int32x4 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
}
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
v_int8x16 d = v_sub(a, b);
v_int8x16 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return a * b + c;
return v_add(v_mul(a, b), c);
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
@ -1381,12 +1374,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpvec res = v_fma(a, a, b*b); \
_Tpvec res = v_fma(a, a, v_mul(b, b)); \
return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
return v_fma(a, a, b*b); \
return v_fma(a, a, v_mul(b, b)); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
@ -1397,19 +1390,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(srai(a.val, imm)); \
} \
@ -1711,9 +1704,9 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
@ -1770,13 +1763,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(l + h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(l + h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
@ -1805,15 +1798,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a)
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff));
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff));
}
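The wider popcounts reuse the byte popcount and fold neighbouring bytes with rotates before masking, so each lane ends up holding the sum of its byte popcounts. A scalar sketch for one 32-bit lane (std::bitset stands in for the per-byte count):

#include <bitset>
#include <cstdint>

inline uint32_t popcount32_from_bytes(uint32_t x)
{
    uint32_t sum = 0;
    for (int i = 0; i < 4; ++i) // fold the four byte counts
        sum += (uint32_t)std::bitset<8>((x >> (8 * i)) & 0xFFu).count();
    return sum;                 // == std::bitset<32>(x).count()
}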
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{

@ -513,48 +513,44 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
/* Element-wise binary and unary operations */
/** Arithmetics **/
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub)
// saturating multiply
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
}
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
@ -596,9 +592,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
inline _Tpvec v_shl(const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
inline _Tpvec v_shr(const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
@ -617,10 +613,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \
inline _Tpvec v_not(const _Tpvec& a) \
{ return _Tpvec(vec_not(a.val)); }
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
@ -650,17 +646,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
/** Comparison **/
#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmple(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpge(a.val, b.val)); }
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
@ -1060,7 +1056,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return a * b + c; }
{ return v_add(v_mul(a, b), c); }
// TODO: exp, log, sin, cos
@ -1089,12 +1085,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); }
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }
{ return v_abs(v_sub(a, b)); }
/** Absolute difference for signed integers **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
@ -1442,7 +1438,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
return v_int64x2(vec_add(even, odd));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
@ -1485,7 +1481,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
return v_uint64x2(vec_add(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@ -1495,13 +1491,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//////// Fast Dot Product ////////
@ -1531,7 +1527,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
@ -1544,10 +1540,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return c + d;
return v_add(c, d);
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
{ return v_add(v_dotprod_expand_fast(a, b), c); }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)

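The three-argument dot-product overloads above now spell their accumulation as v_add(v_dotprod...(a, b), c). A sketch of the usual accumulator pattern on the caller side (`dot_s16` is illustrative; the length is assumed to be a multiple of the lane count):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Running s16 x s16 -> s32 dot product over two buffers.
inline int dot_s16(const short* a, const short* b, int n)
{
    v_int32 acc = vx_setzero_s32();
    const int step = VTraits<v_int16>::vlanes();
    for (int i = 0; i < n; i += step)                          // n % step == 0 assumed
        acc = v_dotprod(vx_load(a + i), vx_load(b + i), acc);  // == v_add(v_dotprod(..), acc)
    return v_reduce_sum(acc);
}
```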
@ -849,53 +849,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)
}
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint8x16, wasm_u8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint8x16, wasm_u8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int8x16, wasm_i8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int8x16, wasm_i8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint16x8, wasm_u16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint16x8, wasm_u16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int16x8, wasm_i16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int16x8, wasm_i16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_uint32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_int32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float32x4, wasm_f32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float32x4, wasm_f32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float32x4, wasm_f32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float32x4, wasm_f32x4_div)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float64x2, wasm_f64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float64x2, wasm_f64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float64x2, wasm_f64x2_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float64x2, wasm_f64x2_div)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
}
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8)
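For 8-bit types the v_mul wrapper above is composed rather than mapped to a single instruction: widen with v_mul_expand, then narrow back with the saturating v_pack. Roughly what the macro expands to for v_int8x16 (a sketch, not the literal preprocessed output):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Saturating 8-bit multiply via widen-multiply-narrow.
inline v_int8x16 mul_sat_s8(const v_int8x16& a, const v_int8x16& b)
{
    v_int16x8 lo, hi;
    v_mul_expand(a, b, lo, hi);  // 16-bit products of the low/high halves
    return v_pack(lo, hi);       // saturating narrowing back to 8 bits
}
```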
@ -986,7 +979,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }
{ return v_add(v_dotprod(a, b), c); }
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
@ -1000,7 +993,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
return v_dotprod(a, b) + c;
return v_add(v_dotprod(a, b), c);
}
// 8 >> 32
@ -1010,13 +1003,13 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
v128_t a1 = wasm_u16x8_shr(a.val, 8);
v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
v128_t b1 = wasm_u16x8_shr(b.val, 8);
return v_uint32x4((
v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
return v_uint32x4((v_add(
v_dotprod(v_int16x8(a0), v_int16x8(b0)),
v_dotprod(v_int16x8(a1), v_int16x8(b1)))).val
);
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
@ -1024,13 +1017,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
v128_t a1 = wasm_i16x8_shr(a.val, 8);
v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
v128_t b1 = wasm_i16x8_shr(b.val, 8);
return v_int32x4(
v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
return v_int32x4(v_add(
v_dotprod(v_int16x8(a0), v_int16x8(b0)),
v_dotprod(v_int16x8(a1), v_int16x8(b1))
);
));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@ -1039,13 +1032,13 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
v128_t a1 = wasm_u32x4_shr(a.val, 16);
v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
v128_t b1 = wasm_u32x4_shr(b.val, 16);
return v_uint64x2((
v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
return v_uint64x2((v_add(
v_dotprod(v_int32x4(a0), v_int32x4(b0)),
v_dotprod(v_int32x4(a1), v_int32x4(b1))).val
);
));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@ -1053,20 +1046,20 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
v128_t a1 = wasm_i32x4_shr(a.val, 16);
v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
v128_t b1 = wasm_i32x4_shr(b.val, 16);
return v_int64x2((
v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
return v_int64x2((v_add(
v_dotprod(v_int32x4(a0), v_int32x4(b0)),
v_dotprod(v_int32x4(a1), v_int32x4(b1)))
);
));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
{ return v_add(v_dotprod_expand(a, b), c); }
//////// Fast Dot Product ////////
@ -1109,10 +1102,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
OPENCV_HAL_IMPL_WASM_BIN_OP(v_and, _Tpvec, wasm_v128_and) \
OPENCV_HAL_IMPL_WASM_BIN_OP(v_or, _Tpvec, wasm_v128_or) \
OPENCV_HAL_IMPL_WASM_BIN_OP(v_xor, _Tpvec, wasm_v128_xor) \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
return _Tpvec(wasm_v128_not(a.val)); \
}
@ -1215,17 +1208,17 @@ OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
@ -1238,10 +1231,10 @@ OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)
#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_eq(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_ne(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); }
OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
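As the macro above shows, 64-bit lane (in)equality on this backend is routed through the f64 comparison: the operands are reinterpreted, compared with v_eq/v_ne, and the resulting mask is cast back. Spelled out for v_uint64x2 (a sketch of the expansion, assuming double-precision lanes are available on the target):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// 64-bit equality mask obtained through the f64 comparison.
inline v_uint64x2 eq_u64(const v_uint64x2& a, const v_uint64x2& b)
{
    return v_reinterpret_as_u64(
        v_eq(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b)));
}
```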
@ -1299,17 +1292,17 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)
/** Absolute difference **/
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub_wrap(a, b);
v_int8x16 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
v_int8x16 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
}
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
@ -1317,25 +1310,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
v_int32x4 d = a - b;
v_int32x4 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
v_int32x4 d = v_sub(a, b);
v_int32x4 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
}
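The v_sub(v_xor(d, m), m) pattern above is the branch-free absolute value: when m is the all-ones mask from v_lt (a < b), d ^ m is the one's complement of d, and subtracting m (== -1) adds one, i.e. negates d in two's complement; when m is zero, d passes through unchanged. A scalar check of the identity (illustrative only, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    int32_t a = 3, b = 10;
    int32_t d = a - b;                 // -7
    int32_t m = -(int32_t)(a < b);     // -1 (all ones) because a < b
    assert(((d ^ m) - m) == 7);        // (~d) + 1 == -d
    assert(((7 ^ 0) - 0) == 7);        // mask == 0: value passes through
    return 0;
}
```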
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
v_int8x16 d = v_sub(a, b);
v_int8x16 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
{ return v_sub(v_max(a, b), v_min(a, b)); }
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return a * b + c;
return v_add(v_mul(a, b), c);
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
@ -1345,12 +1338,12 @@ inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
return a * b + c;
return v_add(v_mul(a, b), c);
}
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
return a * b + c;
return v_add(v_mul(a, b), c);
}
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
@ -1386,19 +1379,19 @@ OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
} \
@ -1694,7 +1687,7 @@ inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
v_expand(v_absdiff(a, b), l16, h16);
v_expand(l16, l16_l32, l16_h32);
v_expand(h16, h16_l32, h16_h32);
return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32)));
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
@ -1703,19 +1696,19 @@ inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
v_expand(v_absdiff(a, b), l16, h16);
v_expand(l16, l16_l32, l16_h32);
v_expand(h16, h16_l32, h16_h32);
return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32)));
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(l + h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(l + h);
return v_reduce_sum(v_add(l, h));
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
@ -1744,15 +1737,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a)
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff));
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff));
}
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{

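The v_popcount helpers above widen the per-byte counts by adding rotated copies and masking; on the caller side the wrappers compose the same way. A hedged sketch of counting set bits in a 32-bit buffer (`count_bits` is illustrative; the length is assumed to be a multiple of the lane count):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Total number of set bits in a buffer of 32-bit words.
inline unsigned count_bits(const unsigned* p, int n)
{
    v_uint32 acc = vx_setzero_u32();
    const int step = VTraits<v_uint32>::vlanes();
    for (int i = 0; i < n; i += step)                  // n % step == 0 assumed
        acc = v_add(acc, v_popcount(vx_load(p + i)));  // per-lane popcount; was acc += ...
    return v_reduce_sum(acc);
}
```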
@ -157,7 +157,7 @@ public:
q0 = v_max(q0, v_min(a, v0_));
q1 = v_min(q1, v_max(b, v0_));
}
q0 = v_max(q0, v_setzero_s16() - q1);
q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
curr[j + k] = (uchar)(v_reduce_max(q0) - 1);
}
}

@ -2618,8 +2618,8 @@ public:
v_uint32 r0, r1, r2, r3;
v_expand(vx_load(S0), r0, r1);
v_expand(vx_load(S1), r2, r3);
r0 += r2; r1 += r3;
v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
r0 = v_add(r0, r2); r1 = v_add(r1, r3);
v_rshr_pack_store<2>(D, v_add(r0, v_rotate_left<1>(r1, r0)));
}
#else
v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3)));

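The resize hunk above sums two expanded rows with v_add and narrows with v_rshr_pack_store<2>, which rounds and divides by four during the store. In the same spirit, though without the horizontal-neighbour fold of the real code, a hedged sketch of averaging two uchar rows with the wrappers (`avg_rows` is illustrative; the width is assumed to be a multiple of the lane count):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Rounded average of two uchar rows: widen, add, then (x + 1) >> 1 while packing.
inline void avg_rows(const uchar* S0, const uchar* S1, uchar* D, int width)
{
    const int step = VTraits<v_uint8>::vlanes();
    for (int x = 0; x < width; x += step)              // width % step == 0 assumed
    {
        v_uint16 a0, a1, b0, b1;
        v_expand(vx_load(S0 + x), a0, a1);
        v_expand(vx_load(S1 + x), b0, b1);
        v_store(D + x, v_rshr_pack<1>(v_add(a0, b0), v_add(a1, b1)));
    }
}
```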
@ -130,9 +130,9 @@ struct Integral_SIMD<uchar, int, double>
el8 = v_add(el8, v_rotate_left<1>(el8));
el8 = v_add(el8, v_rotate_left<2>(el8));
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
el8 = v_add(el8, v_rotate_left<4>(el8));
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
el8 = v_add(el8, v_rotate_left<8>(el8));
#endif
#endif
v_expand(el8, el4l, el4h);
@ -188,11 +188,11 @@ struct Integral_SIMD<uchar, int, double>
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
#endif
#endif
v_expand(el8_1, el4l_1, el4h_1);
@ -350,9 +350,9 @@ struct Integral_SIMD<uchar, int, double>
prev.val = _mm256_permute2x128_si256(el4h.val, el4h.val, 0x31);
#else
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
el8 = v_add(el8, v_rotate_left<4>(el8));
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
el8 = v_add(el8, v_rotate_left<8>(el8));
#endif
#endif
v_expand(el8, el4l, el4h);
@ -364,7 +364,7 @@ struct Integral_SIMD<uchar, int, double>
prev = v_combine_high(el4h, el4h);
#else
v_int32 t = v_rotate_right<12>(el4h);
t |= v_rotate_left<4>(t);
t = v_or(t, v_rotate_left<4>(t));
prev = v_combine_low(t, t);
#endif
#endif
@ -442,9 +442,9 @@ struct Integral_SIMD<uchar, float, double>
el8 = v_add(el8, v_rotate_left<1>(el8));
el8 = v_add(el8, v_rotate_left<2>(el8));
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
el8 = v_add(el8, v_rotate_left<4>(el8));
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
el8 = v_add(el8, v_rotate_left<8>(el8));
#endif
#endif
v_int32 el4li, el4hi;
@ -501,11 +501,11 @@ struct Integral_SIMD<uchar, float, double>
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
#endif
#endif
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
@ -590,13 +590,13 @@ struct Integral_SIMD<uchar, float, double>
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_3 += v_rotate_left<4>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3));
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_3 += v_rotate_left<8>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3));
#endif
#endif
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3;
@ -663,9 +663,9 @@ struct Integral_SIMD<uchar, float, double>
prev.val = _mm256_permute2f128_ps(el4h.val, el4h.val, 0x31);
#else
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
el8 = v_add(el8, v_rotate_left<4>(el8));
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
el8 = v_add(el8, v_rotate_left<8>(el8));
#endif
#endif
v_int32 el4li, el4hi;
@ -678,7 +678,7 @@ struct Integral_SIMD<uchar, float, double>
prev = v_combine_high(el4h, el4h);
#else
v_float32 t = v_rotate_right<12>(el4h);
t |= v_rotate_left<4>(t);
t = v_or(t, v_rotate_left<4>(t));
prev = v_combine_low(t, t);
#endif
#endif
@ -770,9 +770,9 @@ struct Integral_SIMD<uchar, double, double>
el8 = v_add(el8, v_rotate_left<1>(el8));
el8 = v_add(el8, v_rotate_left<2>(el8));
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
el8 = v_add(el8, v_rotate_left<4>(el8));
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
el8 = v_add(el8, v_rotate_left<8>(el8));
#endif
#endif
v_int32 el4li, el4hi;
@ -843,11 +843,11 @@ struct Integral_SIMD<uchar, double, double>
el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
#endif
#endif
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
@ -958,13 +958,13 @@ struct Integral_SIMD<uchar, double, double>
el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
#if CV_SIMD_WIDTH >= 32
el8_1 += v_rotate_left<4>(el8_1);
el8_2 += v_rotate_left<4>(el8_2);
el8_3 += v_rotate_left<4>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3));
#if CV_SIMD_WIDTH == 64
el8_1 += v_rotate_left<8>(el8_1);
el8_2 += v_rotate_left<8>(el8_2);
el8_3 += v_rotate_left<8>(el8_3);
el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3));
#endif
#endif
v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3;
@ -1058,9 +1058,9 @@ struct Integral_SIMD<uchar, double, double>
prev_1.val = prev_2.val = el4hh.val;
#else
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
el8 = v_add(el8, v_rotate_left<4>(el8));
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
el8 = v_add(el8, v_rotate_left<8>(el8));
#endif
#endif
v_int32 el4li, el4hi;

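The rotate-and-add ladders that recur throughout the integral-image code above (v_rotate_left<1>, <2>, <4>, <8>, with the wider steps guarded by CV_SIMD_WIDTH) implement a log-step inclusive prefix sum across the lanes of one register. A 128-bit sketch for eight 16-bit lanes (`prefix_sum_s16` is illustrative):

```cpp
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// In-register inclusive prefix sum: lane i ends up holding v[0] + ... + v[i].
inline v_int16x8 prefix_sum_s16(v_int16x8 v)
{
    v = v_add(v, v_rotate_left<1>(v));  // add the immediately preceding lane
    v = v_add(v, v_rotate_left<2>(v));  // add the partial sum two lanes back
    v = v_add(v, v_rotate_left<4>(v));  // add the partial sum four lanes back
    return v;
}
```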