From e52540162fe40a341fc468b90049139b3523a87d Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Tue, 21 May 2024 19:10:19 +0800 Subject: [PATCH] Merge pull request #25586 from hanliutong:rvv-64f Fix v_round and enable unit tests for scalable universal intrinsic 64F type. #25586 This may be a legacy issue from the previous PR #24325. I don't quite remember why the float 64 part of the unit test was not enabled at that time. Whatever, this patch enables the unit tests for scalable 64F type , and makes the necessary modifications to the RVV backend to make the tests pass. This patch is compiled by GCC 14 and LLVM 17 &18, and tested on QEMU and k230. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- .../core/hal/intrin_rvv_compat_overloaded.hpp | 7 +++- .../opencv2/core/hal/intrin_rvv_scalable.hpp | 26 ++++++++++++- modules/core/test/test_intrin_utils.hpp | 38 ++++++++++++++++--- modules/core/test/test_operations.cpp | 4 -- 4 files changed, 63 insertions(+), 12 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp index 914ad28978..2a323069fd 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp @@ -200,9 +200,14 @@ inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t v return vmul_vx_u32mf2(op1, op2, vl); } -inline static vuint32mf2_t vreinterpret_u32mf2(vint32mf2_t val) +inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val) { return vreinterpret_v_i32mf2_u32mf2(val); } +inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val) +{ + return vreinterpret_v_u16mf2_u32mf2(val); +} + #endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 87531ede1e..0159e4325a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -1528,6 +1528,26 @@ OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPEN OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1) OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1) +#if CV_SIMD_SCALABLE_64F +inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) { \ + vuint16mf4_t idx0 = vid_v_u16mf4(VTraits::vlanes()); + vuint16mf4_t idx1 = vadd(idx0, VTraits::vlanes(), VTraits::vlanes()); + vuint16mf2_t idx = vreinterpret_u16mf2(( \ + vor(vzext_vf2(idx0, VTraits::vlanes()), \ + vreinterpret_u32mf2(vslide1up(vreinterpret_u16mf2(vzext_vf2(idx1, VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes()))); +#if 0 + vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1); +#else // TODO: clean up when RVV Intrinsic is frozen. + vfloat64m2_t temp = vlmul_ext_f64m2(a0); + temp = vset(temp, 1, a1); +#endif + temp = vrgatherei16(temp, idx, VTraits::vlanes()*2); + b0 = vget_f64m1(temp, 0); \ + b1 = vget_f64m1(temp, 1); \ +} +#endif + #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \ inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1859,12 +1879,14 @@ inline v_int32 v_trunc(const v_float32& a) #if CV_SIMD_SCALABLE_64F inline v_int32 v_round(const v_float64& a) { - return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits::vlanes())), VTraits::vlanes()); + return vfncvt_x(vlmul_ext_f64m2(a), VTraits::vlanes()); } inline v_int32 v_round(const v_float64& a, const v_float64& b) { - return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits::vlanes())), 1, b), VTraits::vlanes()); + // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits::vlanes())), 1, b), VTraits::vlanes()); + // Fix https://github.com/opencv/opencv/issues/24746 + return vfncvt_x(vset(vlmul_ext_f64m2(a), 1, b), VTraits::vlanes()); } inline v_int32 v_floor(const v_float64& a) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index a8c565ec46..08138d194d 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -281,7 +281,7 @@ template struct TheTest v_uint64 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a); v_int64 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a); v_float32 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a); -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a); #endif @@ -747,7 +747,7 @@ template struct TheTest TheTest & test_dotprod_expand_f64() { - #if CV_SIMD_64F + #if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) Data dataA, dataB; dataA += std::numeric_limits::max() - VTraits::vlanes(); dataB += std::numeric_limits::min(); @@ -1385,6 +1385,33 @@ template struct TheTest return *this; } +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + TheTest & test_round_pair_f64() + { + typedef typename V_RegTraits::round_reg Ri; + Data data1, data1_border, data2; + // See https://github.com/opencv/opencv/issues/24213 + // https://github.com/opencv/opencv/issues/24163 + // https://github.com/opencv/opencv/pull/24271 + data1_border *= 0.5; + data1 *= 1.1; + data2 += 10; + R a1 = data1, a1_border = data1_border, a2 = data2; + + Data resA = v_round(a1, a1), + resB = v_round(a1_border, a1_border), + resC = v_round(a2, a2); + + for (int i = 0; i < VTraits::vlanes(); ++i) + { + EXPECT_EQ(cvRound(data1[i]), resA[i]); + EXPECT_EQ(cvRound(data1_border[i]), resB[i]); + EXPECT_EQ(cvRound(data2[i]), resC[i]); + } + + return *this; + } +#endif TheTest & test_float_cvt32() { @@ -1405,7 +1432,7 @@ template struct TheTest TheTest & test_float_cvt64() { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) typedef v_float64 Rt; Data dataA; dataA *= 1.1; @@ -1431,7 +1458,7 @@ template struct TheTest TheTest & test_cvt64_double() { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); dataB += VTraits::vlanes(); @@ -1994,7 +2021,7 @@ void test_hal_intrin_float32() void test_hal_intrin_float64() { DUMP_ENTRY(v_float64); -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) TheTest() .test_loadstore() .test_addsub() @@ -2008,6 +2035,7 @@ void test_hal_intrin_float64() .test_mask() .test_unpack() .test_float_math() + .test_round_pair_f64() .test_float_cvt32() .test_reverse() .test_extract<0>().test_extract<1>() diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp index d985a1c2b6..d5622dabb4 100644 --- a/modules/core/test/test_operations.cpp +++ b/modules/core/test/test_operations.cpp @@ -1574,11 +1574,7 @@ TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/is typedef tuple Arith_Regression24163Param; typedef testing::TestWithParam Core_Arith_Regression24163; -#if defined __riscv -TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even) -#else TEST_P(Core_Arith_Regression24163, test_for_ties_to_even) -#endif { const int matDepth = get<0>(GetParam()); const int matHeight= get<1>(GetParam());