From f0d29cd33c5d2b8f6c9c5c3177cbb3a359ee6b33 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Mon, 8 Aug 2022 02:09:54 +0000 Subject: [PATCH 1/8] Add more universal intrinsic implementations for RVV. --- .../core/include/opencv2/core/hal/intrin.hpp | 16 +- .../opencv2/core/hal/intrin_rvv_scalable.hpp | 695 ++++++++++++++++++ modules/core/test/test_intrin_utils.hpp | 248 ++++++- 3 files changed, 950 insertions(+), 9 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index a04de3a12d..c12140bbf8 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -537,7 +537,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); } inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); } inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); } #endif //! @} @@ -554,7 +554,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); } inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); } inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); } #endif //! @} @@ -571,7 +571,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); } inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); } inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); } #endif //! @} @@ -588,7 +588,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); } #endif //! @} @@ -605,7 +605,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); } #endif //! @} @@ -622,7 +622,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } #endif //! 
@} @@ -639,7 +639,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } #endif //! @} @@ -656,7 +656,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } #endif //! @} diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 30c7524699..728112bc99 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -284,6 +284,64 @@ inline v_float64 v_reinterpret_as_f64(const v_float32& v) \ } #endif +//////////// Extract ////////////// + +#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vl) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \ +{ \ + return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \ +} \ +template inline _Tp v_extract_n(_Tpvec v, int i = s) \ +{ \ + return vmv_x(vslidedown(v_setzero_##suffix(), v, i, vl)); \ +} + + +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8, uchar, u8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8, schar, s8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16, ushort, u16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16, short, s16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32, unsigned int, u32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32, int, s32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64, uint64, u64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64, int64, s64, VTraits::vlanes()) + +#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vl) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \ +{ \ + return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \ +} \ +template inline _Tp v_extract_n(_Tpvec v, int i = s) \ +{ \ + return vfmv_f(vslidedown(v_setzero_##suffix(), v, i, vl)); \ +} + +OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, f32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, f64, VTraits::vlanes()) +#endif + +#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \ +inline _Tp v_extract_highest(_Tpvec v) \ +{ \ + return v_extract_n(v, vl-1); \ +} + +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8, schar, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16, ushort, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16, short, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, 
VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits::vlanes()) +#endif + ////////////// Load/Store ////////////// #define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \ @@ -387,6 +445,9 @@ OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2) OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1) OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2) OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2) +#endif inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } @@ -401,6 +462,219 @@ inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_ inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); } +////////////// Pack boolean //////////////////// +/* TODO */ + +////////////// Arithmetics ////////////// +#define OPENCV_HAL_IMPL_RVV_BIN_OP(_Tpvec, ocv_intrin, rvv_intrin) \ +inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return rvv_intrin(a, b, VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, vssubu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, div, vdivu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, vsadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, vssub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, div, vdiv) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, vssubu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, div, vdivu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, vsadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, vssub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, div, vdiv) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, div, vdivu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, div, vdiv) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, vfadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, vfsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, vfmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, vfdiv) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, mul, vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, div, vdivu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, mul, vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, div, vdiv) + +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, vfadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, sub, vfsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, mul, vfmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, vfdiv) +#endif + +#define 
OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) \ +template \ +inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... vf) { \ + return v_add(rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \ +} +#define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) \ +template \ +inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... vf) { \ + return v_mul(rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \ +} +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int8, vsadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint16, vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int16, vsadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint32, vadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int32, vadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float32, vfadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint64, vadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, vadd) + +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, vmul) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, vmul) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, vfmul) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint64, vmul) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int64, vmul) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, vfadd) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, vfmul) +#endif + +#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _TpwvecM2, suffix, wmul) \ +inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \ +{ \ + _TpwvecM2 temp = wmul(a, b, VTraits<_Tpvec>::vlanes()); \ + c = vget_##suffix##m1(temp, 0); \ + d = vget_##suffix##m1(temp, 1); \ +} + +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m2_t, u16, vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, vwmul) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, vwmul) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int32, v_int64, vint64m2_t, i64, vwmul) + + +inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b) +{ + return vmulh(a, b, VTraits::vlanes()); +} +inline v_uint16 v_mul_hi(const v_uint16& a, const v_uint16& b) +{ + return vmulhu(a, b, VTraits::vlanes()); +} + +////////////// Arithmetics (wrap)////////////// +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add_wrap, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add_wrap, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add_wrap, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add_wrap, vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub_wrap, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub_wrap, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub_wrap, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub_wrap, vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, mul_wrap, vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, mul_wrap, vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, vmul) + +//////// Saturating Multiply //////// +// TODO + +////////////// Bitwise logic ////////////// + +#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, vl) \ +inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vand(a, b, vl); \ +} \ +inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vor(a, b, vl); \ +} \ +inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vxor(a, b, vl); \ +} \ +inline _Tpvec v_not (const _Tpvec& a) \ +{ \ + return vnot(a, vl); \ +} + +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16, VTraits::vlanes()) 
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits::vlanes()) + +#define OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(op, vl) \ +inline v_float32 v_##op (const v_float32& a, const v_float32& b) \ +{ \ + return vreinterpret_v_i32m1_f32m1(v##op(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), vl)); \ +} +OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(and, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(or, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(xor, VTraits::vlanes()) + +inline v_float32 v_not(const v_float32& a) +{ + return vreinterpret_v_i32m1_f32m1(vnot(vreinterpret_v_f32m1_i32m1(a), VTraits::vlanes())); +} + +#if CV_SIMD_SCALABLE_64F +#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(op, vl) \ +inline v_float64 v_##op (const v_float64& a, const v_float64& b) \ +{ \ + return vreinterpret_v_i64m1_f64m1(v##op(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), vl)); \ +} +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(and, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(or, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(xor, VTraits::vlanes()) +inline v_float64 v_not (const v_float64& a) +{ + return vreinterpret_v_i64m1_f64m1(vnot(vreinterpret_v_f64m1_i64m1(a), VTraits::vlanes())); +} +#endif + +////////////// Bitwise shifts ////////////// + +#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ \ + return _Tpvec(vsll(a, uint8_t(n), vl)); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ \ + return _Tpvec(vsrl(a, uint8_t(n), vl)); \ +} + +#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ \ + return _Tpvec(vsll(a, uint8_t(n), vl)); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ \ + return _Tpvec(vsra(a, uint8_t(n), vl)); \ +} + +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits::vlanes()) + +////////////// Comparison ////////////// +// TODO ////////////// Min/Max ////////////// @@ -433,6 +707,363 @@ OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits::vlanes OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits::vlanes()) #endif +////////////// Reduce ////////////// + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl, red) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \ + _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \ + res = v##red(res, a, zero, vl); \ + return (scalartype)v_get0(res); \ +} +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits::vlanes(), wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits::vlanes(), wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, VTraits::vlanes(), wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, 
vint32m1_t, int, i32, VTraits::vlanes(), wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits::vlanes(), wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, v_int64, vint64m1_t, int, i64, VTraits::vlanes(), wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64, v_uint64, vuint64m1_t, uint64, u64, VTraits::vlanes(), redsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64, v_int64, vint64m1_t, int64, i64, VTraits::vlanes(), redsum) + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \ + _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \ + res = vfredosum(res, a, zero, vl); \ + return (scalartype)v_get0(res); \ +} +OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, double, f64, VTraits::vlanes()) +#endif + +#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + _Tpvec res = _Tpvec(v##red(a, a, a, vl)); \ + return (scalartype)v_get0(res); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, min, uchar, u8, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, min, schar, i8, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, min, ushort, u16, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, min, short, i16, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, min, unsigned, u32, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, min, int, i32, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, min, float, f32, VTraits::vlanes(), fredmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, max, uchar, u8, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, max, schar, i8, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, max, ushort, u16, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits::vlanes(), fredmax) + +//TODO: v_reduce_sum4 + +////////////// Square-Root ////////////// + +inline v_float32 v_sqrt(const v_float32& x) +{ + return vfsqrt(x, VTraits::vlanes()); +} + +inline v_float32 v_invsqrt(const v_float32& x) +{ + v_float32 one = v_setall_f32(1.0f); + return v_div(one, v_sqrt(x)); +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_sqrt(const v_float64& x) +{ + return vfsqrt(x, VTraits::vlanes()); +} + +inline v_float64 v_invsqrt(const v_float64& x) +{ + v_float64 one = v_setall_f64(1.0f); + return v_div(one, v_sqrt(x)); +} +#endif + +inline v_float32 v_magnitude(const v_float32& a, const v_float32& b) +{ + v_float32 x = vfmacc(vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); + return v_sqrt(x); +} + +inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b) +{ + return v_float32(vfmacc(vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes())); +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_magnitude(const v_float64& a, const v_float64& b) +{ + v_float64 x = vfmacc(vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); + return v_sqrt(x); +} + +inline v_float64 v_sqr_magnitude(const 
v_float64& a, const v_float64& b) +{ + return vfmacc(vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); +} +#endif + +////////////// Multiply-Add ////////////// + +inline v_float32 v_fma(const v_float32& a, const v_float32& b, const v_float32& c) +{ + return vfmacc(c, a, b, VTraits::vlanes()); +} +inline v_int32 v_fma(const v_int32& a, const v_int32& b, const v_int32& c) +{ + return vmacc(c, a, b, VTraits::vlanes()); +} + +inline v_float32 v_muladd(const v_float32& a, const v_float32& b, const v_float32& c) +{ + return v_fma(a, b, c); +} + +inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c) +{ + return v_fma(a, b, c); +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c) +{ + return vfmacc_vv_f64m1(c, a, b, VTraits::vlanes()); +} + +inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c) +{ + return v_fma(a, b, c); +} +#endif + +////////////// Check all/any ////////////// + +#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \ +inline bool v_check_all(const _Tpvec& a) \ +{ \ + return vcpop(vmslt(a, 0, vl), vl) == vl; \ +} \ +inline bool v_check_any(const _Tpvec& a) \ +{ \ + return vcpop(vmslt(a, 0, vl), vl) != 0; \ +} + +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64, VTraits::vlanes()) + + +inline bool v_check_all(const v_uint8& a) +{ return v_check_all(v_reinterpret_as_s8(a)); } +inline bool v_check_any(const v_uint8& a) +{ return v_check_any(v_reinterpret_as_s8(a)); } + +inline bool v_check_all(const v_uint16& a) +{ return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_uint16& a) +{ return v_check_any(v_reinterpret_as_s16(a)); } + +inline bool v_check_all(const v_uint32& a) +{ return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_uint32& a) +{ return v_check_any(v_reinterpret_as_s32(a)); } + +inline bool v_check_all(const v_float32& a) +{ return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_float32& a) +{ return v_check_any(v_reinterpret_as_s32(a)); } + +inline bool v_check_all(const v_uint64& a) +{ return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_uint64& a) +{ return v_check_any(v_reinterpret_as_s64(a)); } + +#if CV_SIMD_SCALABLE_64F +inline bool v_check_all(const v_float64& a) +{ return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_float64& a) +{ return v_check_any(v_reinterpret_as_s64(a)); } +#endif + +////////////// abs ////////////// + +#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \ +inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_sub(v_max(a, b), v_min(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff) +#endif +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8, absdiffs) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs) + +#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) \ +inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vnclipu(vreinterpret_u##width##m2(vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, VTraits<_Tpvec>::vlanes()); \ +} + 
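+// For signed inputs, the difference is taken in the widened type so that
+// max(a,b) - min(a,b) cannot overflow (for v_int8, 127 - (-128) = 255 needs
+// nine bits), then vnclipu saturates the reinterpreted unsigned wide value
+// back down to the unsigned narrow counterpart that v_absdiff returns.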
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16, v_uint16, 32) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32, v_uint32, 64) + +#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \ +inline _Tprvec v_abs(const _Tpvec& a) \ +{ \ + return v_absdiff(a, v_setzero_##suffix()); \ +} + +OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8) +OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16) +OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32) +OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64) +#endif + + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \ +inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_reduce_sum(v_absdiff(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32, float) + +////////////// Select ////////////// + +#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, vl) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vmerge(vmsne(mask, 0, vl), b, a, vl); \ +} + +OPENCV_HAL_IMPL_RVV_SELECT(v_uint8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits::vlanes()) + +inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b) \ +{ \ + return vmerge(vmfne(mask, 0, VTraits::vlanes()), b, a, VTraits::vlanes()); \ +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_select(const v_float64& mask, const v_float64& a, const v_float64& b) \ +{ \ + return vmerge(vmfne(mask, 0, VTraits::vlanes()), b, a, VTraits::vlanes()); \ +} +#endif + +////////////// Rotate shift ////////////// + +#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + return vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + return vslideup(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ return a; } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ CV_UNUSED(b); return a; } + +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8, u8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8, i8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16, u16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16, i16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32, u32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32, i32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64, u64, VTraits::vlanes()) 
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64, i64, VTraits::vlanes()) + +#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + return vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + return vslideup(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ return a; } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ CV_UNUSED(b); return a; } + +OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits::vlanes()) +#endif + +////////////// Convert to float ////////////// +// TODO + +//////////// Broadcast ////////////// + +#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \ +template inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \ +{ \ + return v_setall_##suffix(v_extract_n(v, i)); \ +} \ +inline _Tpvec v_broadcast_highest(_Tpvec v) \ +{ \ + return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \ +} + +OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32) + +////////////// Transpose4x4 ////////////// +// TODO + +////////////// Reverse ////////////// +// TODO //////////// Value reordering //////////// @@ -475,6 +1106,61 @@ inline v_int32 v_load_expand_q(const schar* ptr) return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); } +//////////// PopCount ////////// +// TODO + +//////////// SignMask //////////// +#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec) \ +inline int v_signmask(const _Tpvec& a) \ +{ \ + uint8_t ans[4] = {0}; \ + vsm(ans, vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \ + return *(reinterpret_cast(ans)); \ +} \ +inline int v_scan_forward(const _Tpvec& a) \ +{ \ + return (int)vfirst(vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64) + +inline int64 v_signmask(const v_uint8& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } +inline int64 v_signmask(const v_uint16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } +inline int v_signmask(const v_uint32& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_float32& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_uint64& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } +#if CV_SIMD_SCALABLE_64F +inline int v_signmask(const v_float64& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } +#endif + +//////////// Scan forward //////////// +inline int v_scan_forward(const v_uint8& a) +{ return v_scan_forward(v_reinterpret_as_s8(a)); } +inline int v_scan_forward(const v_uint16& a) +{ return v_scan_forward(v_reinterpret_as_s16(a)); } +inline int v_scan_forward(const v_uint32& a) +{ return 
v_scan_forward(v_reinterpret_as_s32(a)); } +inline int v_scan_forward(const v_float32& a) +{ return v_scan_forward(v_reinterpret_as_s32(a)); } +inline int v_scan_forward(const v_uint64& a) +{ return v_scan_forward(v_reinterpret_as_s64(a)); } +#if CV_SIMD_SCALABLE_64F +inline int v_scan_forward(const v_float64& a) +{ return v_scan_forward(v_reinterpret_as_s64(a)); } +#endif + +//////////// Pack triplets //////////// +// TODO + ////// FP16 support /////// @@ -484,6 +1170,15 @@ inline v_float32 v_load_expand(const float16_t* ptr) return vundefined_f32m1(); } +////////////// Rounding ////////////// +// TODO + +//////// Dot Product //////// +// TODO + +//////// Fast Dot Product //////// +// TODO + inline void v_cleanup() {} CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 763702bf38..d3ced9df87 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1737,7 +1737,33 @@ void test_hal_intrin_uint8() // typedef v_uint8 R; TheTest() .test_loadstore() + .test_expand() + .test_expand_q() + .test_addsub() + .test_arithm_wrap() + .test_mul_expand() + .test_logic() .test_min_max() + .test_absdiff() + .test_mask() + .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() + .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() + .test_extract_n<0>().test_extract_n<1>() + .test_extract_highest() +#if 0 // not implemented in rvv backend yet. + .test_interleave() + .test_mul() + .test_cmp() + .test_dotprod_expand() + .test_reduce() + .test_reduce_sad() + .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() + .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() + .test_pack_b() + .test_unpack() + .test_reverse() + .test_popcount() +#endif ; } @@ -1747,7 +1773,33 @@ void test_hal_intrin_int8() // typedef v_int8 R; TheTest() .test_loadstore() + .test_expand() + .test_expand_q() + .test_addsub() + .test_arithm_wrap() + .test_mul_expand() + .test_logic() .test_min_max() + .test_absdiff() + .test_absdiffs() + .test_abs() + .test_mask() + .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() + .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() + .test_extract_n<0>().test_extract_n<1>() + .test_extract_highest() +#if 0 + .test_interleave() + .test_mul() + .test_cmp() + .test_dotprod_expand() + .test_reduce() + .test_reduce_sad() + .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() + .test_unpack() + .test_reverse() + .test_popcount() +#endif ; } @@ -1759,7 +1811,34 @@ void test_hal_intrin_uint16() // typedef v_uint16 R; TheTest() .test_loadstore() + .test_expand() + .test_addsub() + .test_arithm_wrap() + .test_mul_expand() + .test_mul_hi() + .test_shift<1>() + .test_shift<8>() + .test_logic() .test_min_max() + .test_absdiff() + .test_mask() + .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() + .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() + .test_extract_n<0>().test_extract_n<1>() + .test_extract_highest() +#if 0 + .test_interleave() + .test_mul() + .test_cmp() + .test_dotprod_expand() + .test_reduce() + .test_reduce_sad() + .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() + .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>() + .test_unpack() + .test_reverse() + .test_popcount() +#endif ; } @@ -1769,7 +1848,36 @@ void test_hal_intrin_int16() // typedef v_int16 R; TheTest() .test_loadstore() + 
.test_expand() + .test_addsub() + .test_arithm_wrap() + .test_mul_expand() + .test_mul_hi() + .test_shift<1>() + .test_shift<8>() + .test_logic() .test_min_max() + .test_absdiff() + .test_absdiffs() + .test_abs() + .test_mask() + .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() + .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() + .test_extract_n<0>().test_extract_n<1>() + .test_extract_highest() +#if 0 + .test_interleave() + .test_mul() + .test_cmp() + .test_dotprod() + .test_dotprod_expand() + .test_reduce() + .test_reduce_sad() + .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() + .test_unpack() + .test_reverse() + .test_popcount() +#endif ; } @@ -1781,7 +1889,33 @@ void test_hal_intrin_uint32() // typedef v_uint32 R; TheTest() .test_loadstore() + .test_expand() + .test_addsub() + .test_mul() + .test_mul_expand() + .test_shift<1>() + .test_shift<8>() + .test_logic() .test_min_max() + .test_absdiff() + .test_mask() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + .test_extract_n<0>().test_extract_n<1>() + .test_broadcast_element<0>().test_broadcast_element<1>() + .test_extract_highest() + .test_broadcast_highest() +#if 0 + .test_interleave() + .test_cmp() + .test_reduce() + .test_reduce_sad() + .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() + .test_unpack() + .test_reverse() + .test_transpose() + .test_popcount() +#endif ; } @@ -1791,7 +1925,36 @@ void test_hal_intrin_int32() // typedef v_int32 R; TheTest() .test_loadstore() + .test_expand() + .test_addsub() + .test_mul() + .test_abs() + .test_shift<1>().test_shift<8>() + .test_dotprod_expand_f64() + .test_logic() .test_min_max() + .test_absdiff() + .test_mask() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + .test_extract_n<0>().test_extract_n<1>() + .test_broadcast_element<0>().test_broadcast_element<1>() + .test_extract_highest() + .test_broadcast_highest() +#if 0 + .test_interleave() + .test_cmp() + .test_dotprod() + .test_reduce() + .test_reduce_sad() + .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() + .test_unpack() + .test_reverse() + .test_float_cvt32() + .test_float_cvt64() + .test_transpose() + .test_popcount() +#endif ; } @@ -1803,7 +1966,20 @@ void test_hal_intrin_uint64() // typedef v_uint64 R; TheTest() .test_loadstore() + .test_addsub() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_extract<0>().test_extract<1>() + .test_rotate<0>().test_rotate<1>() + .test_extract_n<0>().test_extract_n<1>() + .test_extract_highest() ; +#if 0 + #if CV_SIMD_64F + .test_cmp64() + #endif + .test_reverse() +#endif } void test_hal_intrin_int64() @@ -1812,7 +1988,21 @@ void test_hal_intrin_int64() // typedef v_int64 R; TheTest() .test_loadstore() + .test_addsub() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_extract<0>().test_extract<1>() + .test_rotate<0>().test_rotate<1>() + .test_extract_n<0>().test_extract_n<1>() + .test_extract_highest() + .test_cvt64_double() ; +#if 0 + #if CV_SIMD_64F + .test_cmp64() + #endif + .test_reverse() +#endif } //============= Floating point ===================================================================== @@ -1822,18 +2012,61 @@ void test_hal_intrin_float32() // typedef v_float32 R; TheTest() .test_loadstore() + .test_addsub() + .test_mul() + .test_div() + .test_sqrt_abs() .test_min_max() + 
.test_float_absdiff() + .test_mask() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + .test_extract_n<0>().test_extract_n<1>() + .test_broadcast_element<0>().test_broadcast_element<1>() + .test_extract_highest() + .test_broadcast_highest() +#if 0 + .test_interleave() + .test_interleave_2channel() + .test_cmp() + .test_reduce() + .test_reduce_sad() + .test_unpack() + .test_float_math() + .test_float_cvt64() + .test_matmul() + .test_transpose() + .test_reverse() + .test_reduce_sum4() +#endif ; } void test_hal_intrin_float64() { DUMP_ENTRY(v_float64); -#if CV_SIMD_64F +#if CV_SIMD_SCALABLE_64F // typedef v_float64 R; TheTest() .test_loadstore() + .test_addsub() + .test_mul() + .test_div() + .test_sqrt_abs() .test_min_max() + .test_float_absdiff() + .test_mask() + .test_extract<0>().test_extract<1>() + .test_rotate<0>().test_rotate<1>() + .test_extract_n<0>().test_extract_n<1>() + .test_extract_highest() +#if 0 + .test_cmp() + .test_unpack() + .test_float_cvt32() + .test_float_math() + .test_reverse() +#endif ; #endif @@ -1874,6 +2107,7 @@ void test_hal_intrin_uint8() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_extract_highest() //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() #if CV_SIMD_WIDTH == 32 .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>() @@ -1914,6 +2148,7 @@ void test_hal_intrin_int8() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_extract_highest() //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } @@ -1951,6 +2186,7 @@ void test_hal_intrin_uint16() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_extract_highest() //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } @@ -1988,6 +2224,7 @@ void test_hal_intrin_int16() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_extract_highest() //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } @@ -2022,6 +2259,8 @@ void test_hal_intrin_uint32() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() .test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() + .test_extract_highest() + .test_broadcast_highest() .test_transpose() ; } @@ -2058,6 +2297,8 @@ void test_hal_intrin_int32() .test_float_cvt32() .test_float_cvt64() .test_transpose() + .test_extract_highest() + .test_broadcast_highest() ; } @@ -2079,6 +2320,7 @@ void test_hal_intrin_uint64() .test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_extract_highest() //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() ; } @@ -2099,6 +2341,7 @@ void test_hal_intrin_int64() 
.test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_extract_highest() //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() .test_cvt64_double() ; @@ -2134,6 +2377,8 @@ void test_hal_intrin_float32() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() .test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() + .test_extract_highest() + .test_broadcast_highest() #if CV_SIMD_WIDTH == 32 .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>() .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>() @@ -2163,6 +2408,7 @@ void test_hal_intrin_float64() .test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() .test_extract_n<0>().test_extract_n<1>().test_extract_n() + .test_extract_highest() //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element() #if CV_SIMD_WIDTH == 32 .test_extract<2>().test_extract<3>() From 2fb652ce09c5f46d108c219d4b13cb86681e7e95 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Fri, 12 Aug 2022 01:44:30 +0000 Subject: [PATCH 2/8] Add testcase for continuous mul and add. --- modules/core/test/test_intrin_utils.hpp | 31 ++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index d3ced9df87..2398d308b9 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -578,16 +578,18 @@ template struct TheTest TheTest & test_addsub() { - Data dataA, dataB; + Data dataA, dataB, dataC; dataB.reverse(); - R a = dataA, b = dataB; + dataA[1] = static_cast(std::numeric_limits::max()); + R a = dataA, b = dataB, c = dataC; - Data resC = v_add(a, b), resD = v_sub(a, b); + Data resD = v_add(a, b), resE = v_add(a, b, c), resF = v_sub(a, b); for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ(saturate_cast(dataA[i] + dataB[i]), resC[i]); - EXPECT_EQ(saturate_cast(dataA[i] - dataB[i]), resD[i]); + EXPECT_EQ(saturate_cast(dataA[i] + dataB[i]), resD[i]); + EXPECT_EQ(saturate_cast(dataA[i] + dataB[i] + dataC[i]), resE[i]); + EXPECT_EQ(saturate_cast(dataA[i] - dataB[i]), resF[i]); } return *this; @@ -614,16 +616,18 @@ template struct TheTest TheTest & test_mul() { - Data dataA, dataB; + Data dataA, dataB, dataC; dataA[1] = static_cast(std::numeric_limits::max()); dataB.reverse(); - R a = dataA, b = dataB; + R a = dataA, b = dataB, c = dataC; - Data resC = v_mul(a, b); + Data resD = v_mul(a, b); + Data resE = v_mul(a, b, c); for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ(saturate_cast(dataA[i] * dataB[i]), resC[i]); + EXPECT_EQ(saturate_cast(dataA[i] * dataB[i]), resD[i]); + EXPECT_EQ(saturate_cast(dataA[i] * dataB[i] * dataC[i]), resE[i]); } return *this; @@ -1741,6 +1745,7 @@ void test_hal_intrin_uint8() .test_expand_q() .test_addsub() .test_arithm_wrap() + .test_mul() .test_mul_expand() .test_logic() .test_min_max() @@ -1752,7 +1757,6 @@ void test_hal_intrin_uint8() .test_extract_highest() #if 0 // not implemented in rvv backend yet. 
.test_interleave() - .test_mul() .test_cmp() .test_dotprod_expand() .test_reduce() @@ -1777,6 +1781,7 @@ void test_hal_intrin_int8() .test_expand_q() .test_addsub() .test_arithm_wrap() + .test_mul() .test_mul_expand() .test_logic() .test_min_max() @@ -1790,7 +1795,6 @@ void test_hal_intrin_int8() .test_extract_highest() #if 0 .test_interleave() - .test_mul() .test_cmp() .test_dotprod_expand() .test_reduce() @@ -1814,6 +1818,7 @@ void test_hal_intrin_uint16() .test_expand() .test_addsub() .test_arithm_wrap() + .test_mul() .test_mul_expand() .test_mul_hi() .test_shift<1>() @@ -1828,7 +1833,6 @@ void test_hal_intrin_uint16() .test_extract_highest() #if 0 .test_interleave() - .test_mul() .test_cmp() .test_dotprod_expand() .test_reduce() @@ -1851,6 +1855,7 @@ void test_hal_intrin_int16() .test_expand() .test_addsub() .test_arithm_wrap() + .test_mul() .test_mul_expand() .test_mul_hi() .test_shift<1>() @@ -1867,7 +1872,7 @@ void test_hal_intrin_int16() .test_extract_highest() #if 0 .test_interleave() - .test_mul() + .test_cmp() .test_dotprod() .test_dotprod_expand() From 80c82e10aa7e9a5d227fe4a2cad0e409c278d6d3 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Fri, 12 Aug 2022 01:45:44 +0000 Subject: [PATCH 3/8] Update implementations on arithmetics. --- .../opencv2/core/hal/intrin_rvv_scalable.hpp | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 728112bc99..5b3f1677e9 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -474,36 +474,26 @@ inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \ OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, vsaddu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, vssubu) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, div, vdivu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, vsadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, vssub) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, div, vdiv) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, vsaddu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, vssubu) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, div, vdivu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, vsadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, vssub) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, div, vdiv) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, vadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, vsub) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, vmul) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, div, vdivu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, vadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, vsub) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, vmul) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, div, vdiv) OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, vfadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, vfsub) OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, vfmul) OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, vfdiv) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, vadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, vsub) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, mul, vmul) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, div, vdivu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, vadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, vsub) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, mul, vmul) -OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, div, vdiv) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, vfadd) @@ -514,12 +504,12 @@ OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, vfdiv) #define OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) \ template \ 
-inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... vf) { \ +inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ return v_add(rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \ } #define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) \ template \ -inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... vf) { \ +inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ return v_mul(rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \ } OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, vsaddu) @@ -535,8 +525,6 @@ OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, vadd) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, vmul) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, vmul) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, vfmul) -OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint64, vmul) -OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int64, vmul) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, vfadd) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, vfmul) @@ -555,8 +543,6 @@ OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, vwmul) OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, vwmulu) OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, vwmul) OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, vwmulu) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int32, v_int64, vint64m2_t, i64, vwmul) - inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b) { @@ -582,7 +568,20 @@ OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, vmul) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, vmul) //////// Saturating Multiply //////// -// TODO +#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _clip, _wmul) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _clip(_wmul(a, b, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()); \ +} \ +template \ +inline _Tpvec v_mul(const _Tpvec& a1, const _Tpvec& a2, const Args&... va) { \ + return v_mul(_clip(_wmul(a1, a2, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()), va...); \ +} + +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8, vnclipu, vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8, vnclip, vwmul) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16, vnclipu, vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16, vnclip, vwmul) ////////////// Bitwise logic ////////////// From e65ad44b32136dcec1cb4ae5b249f56b41496949 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Fri, 12 Aug 2022 14:12:52 +0000 Subject: [PATCH 4/8] Remove redundant intrinsics. 
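
The float bitwise helpers (v_and/v_or/v_xor/v_not on v_float32 and v_float64),
the 8-bit shifts, the 64-bit integer min/max, and the 64-bit integer and f64
reduce-sums are dropped: none of them are exercised by the test chains this
series enables for the scalable backend. Together with the integer div and
64-bit mul overloads removed in the previous patch, v_mul on 8/16-bit lanes
is now only the saturating version.

An illustrative sketch of that saturating behaviour (not taken from this
diff; v_mul_wrap keeps the old modulo semantics):

    v_uint8 a = vx_setall_u8(200), b = vx_setall_u8(2);
    v_uint8 sat  = v_mul(a, b);       // each lane: 255 (400 saturated)
    v_uint8 wrap = v_mul_wrap(a, b);  // each lane: 144 (400 mod 256)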
--- .../opencv2/core/hal/intrin_rvv_scalable.hpp | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 5b3f1677e9..396a2d68a5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -612,34 +612,7 @@ OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits::vlanes()) -#define OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(op, vl) \ -inline v_float32 v_##op (const v_float32& a, const v_float32& b) \ -{ \ - return vreinterpret_v_i32m1_f32m1(v##op(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), vl)); \ -} -OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(and, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(or, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(xor, VTraits::vlanes()) -inline v_float32 v_not(const v_float32& a) -{ - return vreinterpret_v_i32m1_f32m1(vnot(vreinterpret_v_f32m1_i32m1(a), VTraits::vlanes())); -} - -#if CV_SIMD_SCALABLE_64F -#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(op, vl) \ -inline v_float64 v_##op (const v_float64& a, const v_float64& b) \ -{ \ - return vreinterpret_v_i64m1_f64m1(v##op(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), vl)); \ -} -OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(and, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(or, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(xor, VTraits::vlanes()) -inline v_float64 v_not (const v_float64& a) -{ - return vreinterpret_v_i64m1_f64m1(vnot(vreinterpret_v_f64m1_i64m1(a), VTraits::vlanes())); -} -#endif ////////////// Bitwise shifts ////////////// @@ -663,11 +636,9 @@ template inline _Tpvec v_shr(const _Tpvec& a) \ return _Tpvec(vsra(a, uint8_t(n), vl)); \ } -OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits::vlanes()) @@ -697,10 +668,6 @@ OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_min, vminu, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_max, vmaxu, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_min, vmin, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_max, vmax, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits::vlanes()) @@ -722,8 +689,6 @@ OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, V OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, vint32m1_t, int, i32, VTraits::vlanes(), wredsum) OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits::vlanes(), wredsumu) OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, 
v_int64, vint64m1_t, int, i64, VTraits::vlanes(), wredsum) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64, v_uint64, vuint64m1_t, uint64, u64, VTraits::vlanes(), redsum) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64, v_int64, vint64m1_t, int64, i64, VTraits::vlanes(), redsum) #define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) \ inline scalartype v_reduce_sum(const _Tpvec& a) \ @@ -734,9 +699,6 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \ return (scalartype)v_get0(res); \ } OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits::vlanes()) -#if CV_SIMD_SCALABLE_64F -OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, double, f64, VTraits::vlanes()) -#endif #define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ From f572ae3474ae7961636178c51b0d3d7c0ca43d0a Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Fri, 12 Aug 2022 14:13:26 +0000 Subject: [PATCH 5/8] add missing test cases(v_abs) --- modules/core/test/test_intrin_utils.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 2398d308b9..4af3998c3e 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -703,7 +703,7 @@ template struct TheTest for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - R_type ssub = dataA[i] - dataB[i] < std::numeric_limits::min() ? std::numeric_limits::min() : dataA[i] - dataB[i]; + R_type ssub = dataA[i] - dataB[i] < std::numeric_limits::lowest() ? std::numeric_limits::lowest() : dataA[i] - dataB[i]; EXPECT_EQ((u_type)std::abs(ssub), resC[i]); } @@ -2018,6 +2018,7 @@ void test_hal_intrin_float32() TheTest() .test_loadstore() .test_addsub() + .test_abs() .test_mul() .test_div() .test_sqrt_abs() @@ -2057,6 +2058,7 @@ void test_hal_intrin_float64() .test_addsub() .test_mul() .test_div() + .test_abs() .test_sqrt_abs() .test_min_max() .test_float_absdiff() @@ -2364,6 +2366,7 @@ void test_hal_intrin_float32() .test_addsub() .test_mul() .test_div() + .test_abs() .test_cmp() .test_sqrt_abs() .test_min_max() @@ -2401,6 +2404,7 @@ void test_hal_intrin_float64() .test_addsub() .test_mul() .test_div() + .test_abs() .test_cmp() .test_sqrt_abs() .test_min_max() From 189f64726437a3756329890ea75c8ca5fde46bcf Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Wed, 17 Aug 2022 14:38:38 +0000 Subject: [PATCH 6/8] Add implementation for zip, transpose, interleave, reverse and combine. 
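
v_zip widens both inputs with vzext_vf2, slides the second widened register
up by one narrow element so its lanes land in the odd positions, ORs the two
registers, and splits the m2 result back into two m1 halves with vget.
v_reverse is a vrgather through a descending index built from vid/vrsub, and
the combine helpers are composed from vslideup/vslidedown.

Lane-level sketch of the intended semantics (values illustrative, 4 lanes):

    // a = {1 2 3 4}, b = {5 6 7 8}
    // v_zip(a, b, lo, hi):  lo = {1 5 2 6}, hi = {3 7 4 8}
    // v_reverse(a):         {4 3 2 1}
    // v_combine_low(a, b):  {1 2 5 6}
    // v_combine_high(a, b): {3 4 7 8}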
From 189f64726437a3756329890ea75c8ca5fde46bcf Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Wed, 17 Aug 2022 14:38:38 +0000
Subject: [PATCH 6/8] Add implementation for zip, transpose, interleave, reverse and combine.

---
 .../opencv2/core/hal/intrin_rvv_scalable.hpp | 150 +++++++++++++++++-
 1 file changed, 147 insertions(+), 3 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index 396a2d68a5..7452ad91ad 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -5,6 +5,7 @@
 #include <initializer_list>
 #include <assert.h>
 #include <vector>
+#include <opencv2/core/check.hpp>
 
 #ifndef CV_RVV_MAX_VLEN
 #define CV_RVV_MAX_VLEN 1024
@@ -1020,11 +1021,26 @@ OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
 OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)
 
-////////////// Transpose4x4 //////////////
-// TODO
 
 ////////////// Reverse //////////////
-// TODO
+#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) \
+inline _Tpvec v_reverse(const _Tpvec& a) \
+{ \
+    vuint##width##m1_t vidx = vrsub(vid_v_u##width##m1(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); \
+    return vrgather(a, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64, 64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int64, 64)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64)
+#endif
 
 //////////// Value reordering ////////////
 
@@ -1067,6 +1083,134 @@ inline v_int32 v_load_expand_q(const schar* ptr)
     return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
 }
 
+
+/* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
+   a0 = {A1 A2 A3 A4}
+   a1 = {B1 B2 B3 B4}
+---------------
+   {A1 B1 A2 B2} and {A3 B3 A4 B4}
+*/
+#define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
+    _wTpvec temp = vreinterpret_##suffix##m2(convert2um2( \
+        vor(vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \
+            vreinterpret_u##width2##m2(vslide1up(vreinterpret_u##width##m2(vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \
+            VTraits<_Tpvec>::vlanes()))); \
+    b0 = vget_##suffix##m1(temp, 0); \
+    b1 = vget_##suffix##m1(temp, 1); \
+}
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, vreinterpret_u8m2, vreinterpret_u8m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, vreinterpret_u16m2, vreinterpret_u16m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
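A scalar reference model of what v_zip above computes may help: zero-extending a0 puts its elements in the even (low) half-positions of a double-width vector, zero-extending a1 and sliding it up by one element puts its elements in the odd positions, and the OR merges the two before the m2 register is split into its m1 halves. This sketch only models the semantics, not the backend code:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // b0 gets the interleaved low halves of a0/a1, b1 the interleaved high halves.
    static void zip_ref(const std::vector<uint8_t>& a0, const std::vector<uint8_t>& a1,
                        std::vector<uint8_t>& b0, std::vector<uint8_t>& b1)
    {
        size_t n = a0.size();
        b0.resize(n); b1.resize(n);
        for (size_t i = 0; i < n / 2; ++i)
        {
            b0[2*i] = a0[i];         b0[2*i+1] = a1[i];
            b1[2*i] = a0[n/2 + i];   b1[2*i+1] = a1[n/2 + i];
        }
    }

    int main()
    {
        std::vector<uint8_t> a0 = {1,2,3,4}, a1 = {5,6,7,8}, b0, b1;
        zip_ref(a0, a1, b0, b1);
        for (uint8_t v : b0) std::printf("%d ", v);  // 1 5 2 6
        for (uint8_t v : b1) std::printf("%d ", v);  // 3 7 4 8
        std::printf("\n");
        return 0;
    }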
+#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vslideup(a, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vslideup( \
+        vslidedown(a, a, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
+        vslidedown(b, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
+        VTraits<_Tpvec>::vlanes()/2, \
+        VTraits<_Tpvec>::vlanes()); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    c = v_combine_low(a, b); \
+    d = v_combine_high(a, b); \
+}
+
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint8, 8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int8, 8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
+#endif
+
+static uint64_t idx_interleave_pairs[] = { \
+    0x0705060403010200, 0x0f0d0e0c0b090a08, 0x1715161413111210, 0x1f1d1e1c1b191a18, \
+    0x2725262423212220, 0x2f2d2e2c2b292a28, 0x3735363433313230, 0x3f3d3e3c3b393a38, \
+    0x4745464443414240, 0x4f4d4e4c4b494a48, 0x5755565453515250, 0x5f5d5e5c5b595a58, \
+    0x6765666463616260, 0x6f6d6e6c6b696a68, 0x7775767473717270, 0x7f7d7e7c7b797a78};
+
+static uint64_t idx_interleave_quads[] = { \
+    0x0703060205010400, 0x0f0b0e0a0d090c08, 0x1713161215111410, 0x1f1b1e1a1d191c18, \
+    0x2723262225212420, 0x2f2b2e2a2d292c28, 0x3733363235313430, 0x3f3b3e3a3d393c38, \
+    0x4743464245414440, 0x4f4b4e4a4d494c48, 0x5753565255515450, 0x5f5b5e5a5d595c58, \
+    0x6763666265616460, 0x6f6b6e6a6d696c68, 0x7773767275717470, 0x7f7b7e7a7d797c78};
+
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) \
+inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
+    CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
+    vuint8m1_t vidx = vundefined_u8m1(); \
+    vidx = vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)); \
+    return vrgather(vec, vidx, VTraits<v_uint8>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads)
+
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) \
+inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
+    CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
+    vuint##width##m1_t vidx = vundefined_u##width##m1(); \
+    vidx = vget_u##width##m1(vzext_vfx(vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)), VTraits<v_uint8>::vlanes()), 0); \
+    return vrgather(vec, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, pairs)
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, quads)
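The idx_interleave_* tables above are vrgather indices packed eight per uint64, least-significant byte first. Decoding the first pairs constant shows the per-4-lane pattern {0, 2, 1, 3} that v_interleave_pairs implements; the quads tables encode {0, 4, 1, 5, 2, 6, 3, 7} per 8 lanes the same way. A small standalone decoder:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Bytes are stored least-significant first, so 0x0705060403010200
        // reads as the gather pattern {0,2,1,3,4,6,5,7}: within every block
        // of four lanes, lanes 1 and 2 swap.
        uint64_t word = 0x0705060403010200ULL;
        for (int i = 0; i < 8; ++i)
            std::printf("%d ", (int)((word >> (8 * i)) & 0xff));
        std::printf("\n"); // 0 2 1 3 4 6 5 7
        return 0;
    }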
+
+////////////// Transpose4x4 //////////////
+#define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) \
+static inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
+    int vl = 4; \
+    _wTpvec temp = vreinterpret_##suffix##m2(convert2u( \
+        vor(vzext_vf2(convert(a0), vl), \
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(convert(a1), vl)), 0, vl*2)), \
+            vl))); \
+    b0 = vget_##suffix##m1(temp, 0); \
+    b1 = vget_##suffix##m1(vrgather(temp, vadd(vid_v_u32m2(vl), 4, vl)/*{4,5,6,7} */, vl), 0); \
+}
+
+OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m2_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m2_t, i32, vreinterpret_u32m2, vreinterpret_u32m1)
+OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m2_t, f32, vreinterpret_u32m2, vreinterpret_u32m1)
+
+#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \
+    _Tpvec t0, t1, t2, t3 = vundefined_##suffix##m1(); \
+    v_zip4(a0, a2, t0, t2); \
+    v_zip4(a1, a3, t1, t3); \
+    v_zip4(t0, t1, b0, b1); \
+    v_zip4(t2, t3, b2, b3); \
+}
+
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_int32, i32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_float32, f32)
+
 //////////// PopCount //////////
 // TODO
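The transpose above is the classic two-round butterfly: v_zip4 interleaves 4-lane blocks, rows (0,2) and (1,3) are zipped first, and zipping those intermediate results yields the transposed columns. A scalar model of the same index algebra; zip4() here is a plain-array stand-in for the RVV v_zip4:

    #include <cstdio>

    static void zip4(const int a[4], const int b[4], int lo[4], int hi[4])
    {
        lo[0] = a[0]; lo[1] = b[0]; lo[2] = a[1]; lo[3] = b[1];
        hi[0] = a[2]; hi[1] = b[2]; hi[2] = a[3]; hi[3] = b[3];
    }

    int main()
    {
        int m[4][4] = {{1,2,3,4}, {5,6,7,8}, {9,10,11,12}, {13,14,15,16}};
        int t0[4], t1[4], t2[4], t3[4], b[4][4];
        v_zip4-style butterfly:
        zip4(m[0], m[2], t0, t2);
        zip4(m[1], m[3], t1, t3);
        zip4(t0, t1, b[0], b[1]);
        zip4(t2, t3, b[2], b[3]);
        for (int r = 0; r < 4; ++r)      // prints the transpose:
        {                                // 1 5 9 13 / 2 6 10 14 / ...
            for (int c = 0; c < 4; ++c) std::printf("%d ", b[r][c]);
            std::printf("\n");
        }
        return 0;
    }

(The stray line "v_zip4-style butterfly:" above should of course be read as a comment; the four zip4 calls mirror the macro body exactly.)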
From 8dc332721ffce54c750f1a259690cc36c2b126b5 Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Wed, 17 Aug 2022 14:39:23 +0000
Subject: [PATCH 7/8] Add test cases for interleave_pairs/quads and enable other test cases.

---
 modules/core/test/test_intrin_utils.hpp | 120 +++++++++++++++++-------
 1 file changed, 87 insertions(+), 33 deletions(-)

diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 4af3998c3e..cb6ac5d901 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -503,6 +503,38 @@ template <typename R> struct TheTest
         return *this;
     }
 
+    TheTest & test_interleave_pq()
+    {
+        Data<R> dataA;
+        R a = dataA;
+        Data<R> resP = v_interleave_pairs(a);
+        Data<R> resQ = v_interleave_quads(a);
+        for (int i = 0; i < VTraits<R>::vlanes()/4; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ(resP[4*i],     dataA[4*i  ]);
+            EXPECT_EQ(resP[4*i + 1], dataA[4*i+2]);
+            EXPECT_EQ(resP[4*i + 2], dataA[4*i+1]);
+            EXPECT_EQ(resP[4*i + 3], dataA[4*i+3]);
+        }
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+        {
+            printf("%d%s", (int)resQ[i], i == VTraits<R>::vlanes()-1 ? "\n" : " ");
+        }
+        for (int i = 0; i < VTraits<R>::vlanes()/8; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ(resQ[8*i],     dataA[8*i  ]);
+            EXPECT_EQ(resQ[8*i + 1], dataA[8*i+4]);
+            EXPECT_EQ(resQ[8*i + 2], dataA[8*i+1]);
+            EXPECT_EQ(resQ[8*i + 3], dataA[8*i+5]);
+            EXPECT_EQ(resQ[8*i + 4], dataA[8*i+2]);
+            EXPECT_EQ(resQ[8*i + 5], dataA[8*i+6]);
+            EXPECT_EQ(resQ[8*i + 6], dataA[8*i+3]);
+            EXPECT_EQ(resQ[8*i + 7], dataA[8*i+7]);
+        }
+        return *this;
+    }
 
     // float32x4 only
     TheTest & test_interleave_2channel()
@@ -1577,19 +1609,27 @@ template <typename R> struct TheTest
 
         v_transpose4x4(a, b, c, d, e, f, g, h);
 
-        Data<R> res[4] = {e, f, g, h};
-        // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
-        // {
-        int i = 0;
-        for (int j = 0; j < 4; ++j)
-        {
-            SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
-            EXPECT_EQ(dataA[i + j], res[j][i]);
-            EXPECT_EQ(dataB[i + j], res[j][i + 1]);
-            EXPECT_EQ(dataC[i + j], res[j][i + 2]);
-            EXPECT_EQ(dataD[i + j], res[j][i + 3]);
-        }
-        // }
+        // Data<R> res[4] = {e, f, g, h}; // Generates incorrect data in certain RVV case.
+        Data<R> res0 = e, res1 = f, res2 = g, res3 = h;
+        EXPECT_EQ(dataA[0], res0[0]);
+        EXPECT_EQ(dataB[0], res0[1]);
+        EXPECT_EQ(dataC[0], res0[2]);
+        EXPECT_EQ(dataD[0], res0[3]);
+
+        EXPECT_EQ(dataA[1], res1[0]);
+        EXPECT_EQ(dataB[1], res1[1]);
+        EXPECT_EQ(dataC[1], res1[2]);
+        EXPECT_EQ(dataD[1], res1[3]);
+
+        EXPECT_EQ(dataA[2], res2[0]);
+        EXPECT_EQ(dataB[2], res2[1]);
+        EXPECT_EQ(dataC[2], res2[2]);
+        EXPECT_EQ(dataD[2], res2[3]);
+
+        EXPECT_EQ(dataA[3], res3[0]);
+        EXPECT_EQ(dataB[3], res3[1]);
+        EXPECT_EQ(dataC[3], res3[2]);
+        EXPECT_EQ(dataD[3], res3[3]);
 
         return *this;
     }
@@ -1741,6 +1781,7 @@ void test_hal_intrin_uint8()
     // typedef v_uint8 R;
     TheTest<v_uint8>()
         .test_loadstore()
+        .test_interleave_pq()
         .test_expand()
         .test_expand_q()
         .test_addsub()
@@ -1755,6 +1796,8 @@ void test_hal_intrin_uint8()
         .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
 #if 0 // not implemented in rvv backend yet.
         .test_interleave()
         .test_cmp()
@@ -1764,8 +1807,6 @@ void test_hal_intrin_uint8()
         .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
         .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
         .test_pack_b()
-        .test_unpack()
-        .test_reverse()
         .test_popcount()
 #endif
     ;
 }
 
@@ -1777,6 +1818,7 @@ void test_hal_intrin_int8()
     // typedef v_int8 R;
     TheTest<v_int8>()
         .test_loadstore()
+        .test_interleave_pq()
         .test_expand()
         .test_expand_q()
         .test_addsub()
@@ -1793,6 +1835,8 @@ void test_hal_intrin_int8()
         .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
 #if 0
         .test_interleave()
         .test_cmp()
@@ -1800,8 +1844,6 @@ void test_hal_intrin_int8()
         .test_reduce()
         .test_reduce_sad()
         .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_unpack()
-        .test_reverse()
         .test_popcount()
 #endif
     ;
 }
 
@@ -1815,6 +1857,7 @@ void test_hal_intrin_uint16()
     // typedef v_uint16 R;
    TheTest<v_uint16>()
         .test_loadstore()
+        .test_interleave_pq()
         .test_expand()
         .test_addsub()
         .test_arithm_wrap()
@@ -1831,6 +1874,8 @@ void test_hal_intrin_uint16()
         .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
 #if 0
         .test_interleave()
         .test_cmp()
@@ -1839,8 +1884,6 @@ void test_hal_intrin_uint16()
         .test_reduce()
         .test_reduce_sad()
         .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
         .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
-        .test_unpack()
-        .test_reverse()
         .test_popcount()
 #endif
     ;
 }
 
@@ -1852,6 +1895,7 @@ void test_hal_intrin_int16()
     // typedef v_int16 R;
     TheTest<v_int16>()
         .test_loadstore()
+        .test_interleave_pq()
         .test_expand()
         .test_addsub()
         .test_arithm_wrap()
@@ -1870,6 +1914,8 @@ void test_hal_intrin_int16()
         .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
 #if 0
 
         .test_interleave()
@@ -1879,8 +1925,6 @@ void test_hal_intrin_int16()
         .test_cmp()
         .test_reduce()
         .test_reduce_sad()
         .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_unpack()
-        .test_reverse()
         .test_popcount()
 #endif
     ;
 }
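The test lists above compose because every TheTest method returns *this. A minimal sketch of the chaining idiom; Checker and its methods are hypothetical stand-ins, not the OpenCV fixture:

    #include <cstdio>

    struct Checker
    {
        // Each test returns a reference to the fixture so cases chain.
        Checker& test_a() { std::printf("a "); return *this; }
        Checker& test_b() { std::printf("b\n"); return *this; }
    };

    int main()
    {
        Checker()
            .test_a()
            .test_b(); // same shape as TheTest<v_uint8>().test_loadstore()...
        return 0;
    }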
@@ -1894,6 +1938,7 @@ void test_hal_intrin_uint32()
     // typedef v_uint32 R;
     TheTest<v_uint32>()
         .test_loadstore()
+        .test_interleave_pq()
         .test_expand()
         .test_addsub()
         .test_mul()
@@ -1910,15 +1955,15 @@ void test_hal_intrin_uint32()
         .test_broadcast_element<0>().test_broadcast_element<1>()
         .test_extract_highest()
         .test_broadcast_highest()
+        .test_unpack()
+        .test_transpose()
+        .test_reverse()
 #if 0
         .test_interleave()
         .test_cmp()
         .test_reduce()
         .test_reduce_sad()
         .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_reverse()
-        .test_transpose()
         .test_popcount()
 #endif
     ;
 }
 
@@ -1930,6 +1975,7 @@ void test_hal_intrin_int32()
     // typedef v_int32 R;
     TheTest<v_int32>()
         .test_loadstore()
+        .test_interleave_pq()
         .test_expand()
         .test_addsub()
         .test_mul()
@@ -1946,6 +1992,9 @@ void test_hal_intrin_int32()
         .test_broadcast_element<0>().test_broadcast_element<1>()
         .test_extract_highest()
         .test_broadcast_highest()
+        .test_unpack()
+        .test_transpose()
+        .test_reverse()
 #if 0
         .test_interleave()
         .test_cmp()
@@ -1953,11 +2002,8 @@ void test_hal_intrin_int32()
         .test_reduce()
         .test_reduce_sad()
         .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_reverse()
         .test_float_cvt32()
         .test_float_cvt64()
-        .test_transpose()
         .test_popcount()
 #endif
     ;
 }
 
@@ -1978,12 +2024,12 @@ void test_hal_intrin_uint64()
         .test_rotate<0>().test_rotate<1>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
+        .test_reverse()
     ;
 #if 0
 #if CV_SIMD_64F
         .test_cmp64()
 #endif
-        .test_reverse()
 #endif
 }
 
@@ -2001,12 +2047,12 @@ void test_hal_intrin_int64()
         .test_rotate<0>().test_rotate<1>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
         .test_cvt64_double()
+        .test_reverse()
     ;
 #if 0
 #if CV_SIMD_64F
         .test_cmp64()
 #endif
-        .test_reverse()
 #endif
 }
 
@@ -2017,6 +2063,7 @@ void test_hal_intrin_float32()
     // typedef v_float32 R;
     TheTest<v_float32>()
         .test_loadstore()
+        .test_interleave_pq()
         .test_addsub()
         .test_abs()
         .test_mul()
         .test_div()
@@ -2031,18 +2078,18 @@ void test_hal_intrin_float32()
         .test_broadcast_element<0>().test_broadcast_element<1>()
         .test_extract_highest()
         .test_broadcast_highest()
+        .test_unpack()
+        .test_transpose()
+        .test_reverse()
 #if 0
         .test_interleave()
         .test_interleave_2channel()
         .test_cmp()
         .test_reduce()
         .test_reduce_sad()
-        .test_unpack()
         .test_float_math()
         .test_float_cvt64()
         .test_matmul()
-        .test_transpose()
-        .test_reverse()
         .test_reduce_sum4()
 #endif
     ;
 }
 
@@ -2067,12 +2114,12 @@ void test_hal_intrin_float64()
         .test_rotate<0>().test_rotate<1>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
+        .test_reverse()
 #if 0
         .test_cmp()
         .test_unpack()
         .test_float_cvt32()
         .test_float_math()
-        .test_reverse()
 #endif
     ;
 }
 
@@ -2091,6 +2138,7 @@ void test_hal_intrin_uint8()
     TheTest<R>()
         .test_loadstore()
         .test_interleave()
+        .test_interleave_pq()
         .test_expand()
         .test_expand_q()
         .test_addsub()
@@ -2132,6 +2180,7 @@ void test_hal_intrin_int8()
     TheTest<R>()
         .test_loadstore()
         .test_interleave()
+        .test_interleave_pq()
         .test_expand()
         .test_expand_q()
         .test_addsub()
@@ -2169,6 +2218,7 @@ void test_hal_intrin_uint16()
     TheTest<R>()
         .test_loadstore()
         .test_interleave()
+        .test_interleave_pq()
         .test_expand()
         .test_addsub()
         .test_arithm_wrap()
@@ -2205,6 +2255,7 @@ void test_hal_intrin_int16()
     TheTest<R>()
         .test_loadstore()
         .test_interleave()
+        .test_interleave_pq()
         .test_expand()
         .test_addsub()
         .test_arithm_wrap()
@@ -2245,6 +2296,7 @@ void test_hal_intrin_uint32()
     TheTest<R>()
         .test_loadstore()
         .test_interleave()
+        // .test_interleave_pq() //not implemented in AVX
         .test_expand()
         .test_addsub()
         .test_mul()
@@ -2279,6 +2331,7 @@ void test_hal_intrin_int32()
     TheTest<R>()
         .test_loadstore()
         .test_interleave()
+        // .test_interleave_pq() //not implemented in AVX
         .test_expand()
         .test_addsub()
         .test_mul()
@@ -2363,6 +2416,7 @@ void test_hal_intrin_float32()
         .test_loadstore()
         .test_interleave()
         .test_interleave_2channel()
+        // .test_interleave_pq() //not implemented in AVX
         .test_addsub()
         .test_mul()
         .test_div()

From b9a1039566a60175fad8dd646e598b2e31ff1f4a Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Thu, 18 Aug 2022 08:01:09 +0000
Subject: [PATCH 8/8] Remove the debug log from test_interleave_pq.

---
 modules/core/test/test_intrin_utils.hpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index cb6ac5d901..ac05768c35 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -517,10 +517,6 @@ template <typename R> struct TheTest
             EXPECT_EQ(resP[4*i + 2], dataA[4*i+1]);
             EXPECT_EQ(resP[4*i + 3], dataA[4*i+3]);
         }
-        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
-        {
-            printf("%d%s", (int)resQ[i], i == VTraits<R>::vlanes()-1 ? "\n" : " ");
-        }
         for (int i = 0; i < VTraits<R>::vlanes()/8; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
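Taken together, the series leaves the scalable RVV backend with enough of the universal-intrinsics surface for simple kernels like the sketch below. It assumes an OpenCV build with CV_SIMD_SCALABLE enabled and, for brevity, that n is a multiple of the vector length; reverse_u8() is a hypothetical helper, not part of the patches:

    #include <opencv2/core/hal/intrin.hpp>

    // Reverse a byte buffer using the v_reverse implemented in this series.
    static void reverse_u8(const unsigned char* src, unsigned char* dst, int n)
    {
        using namespace cv;
        int step = VTraits<v_uint8>::vlanes(); // lane count is a runtime value on RVV
        for (int i = 0; i + step <= n; i += step)
        {
            v_uint8 v = vx_load(src + i);              // load one scalable register
            v_store(dst + n - i - step, v_reverse(v)); // store at the mirrored offset
        }
        // any tail (n % step != 0) would be handled by scalar code in a real kernel
    }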