From e52540162fe40a341fc468b90049139b3523a87d Mon Sep 17 00:00:00 2001
From: HAN Liutong <liutong2020@iscas.ac.cn>
Date: Tue, 21 May 2024 19:10:19 +0800
Subject: [PATCH] Merge pull request #25586 from hanliutong:rvv-64f

Fix v_round and enable unit tests for scalable universal intrinsic 64F type. #25586

This may be a legacy issue from the previous PR #24325. I don't quite remember why the float 64 part of the unit test was not enabled at that time.

Whatever, this patch enables the unit tests for scalable 64F type , and makes the necessary modifications to the RVV backend to make the tests pass.

This patch is compiled by GCC 14 and LLVM 17 &18, and tested on QEMU and k230.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 .../core/hal/intrin_rvv_compat_overloaded.hpp |  7 +++-
 .../opencv2/core/hal/intrin_rvv_scalable.hpp  | 26 ++++++++++++-
 modules/core/test/test_intrin_utils.hpp       | 38 ++++++++++++++++---
 modules/core/test/test_operations.cpp         |  4 --
 4 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
index 914ad28978..2a323069fd 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
@@ -200,9 +200,14 @@ inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t v
     return vmul_vx_u32mf2(op1, op2, vl);
 }
 
-inline static vuint32mf2_t vreinterpret_u32mf2(vint32mf2_t val)
+inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
 {
     return vreinterpret_v_i32mf2_u32mf2(val);
 }
 
+inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
+{
+    return vreinterpret_v_u16mf2_u32mf2(val);
+}
+
 #endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index 87531ede1e..0159e4325a 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -1528,6 +1528,26 @@ OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPEN
 OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
 OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
 
+#if CV_SIMD_SCALABLE_64F
+inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) { \
+    vuint16mf4_t idx0 = vid_v_u16mf4(VTraits<v_float64>::vlanes());
+    vuint16mf4_t idx1 = vadd(idx0, VTraits<v_float64>::vlanes(), VTraits<v_float64>::vlanes());
+    vuint16mf2_t idx = vreinterpret_u16mf2(( \
+        vor(vzext_vf2(idx0, VTraits<v_float64>::vlanes()), \
+            vreinterpret_u32mf2(vslide1up(vreinterpret_u16mf2(vzext_vf2(idx1, VTraits<v_float64>::vlanes())), 0, VTraits<v_uint32>::vlanes())), \
+            VTraits<v_uint32>::vlanes())));
+#if 0
+    vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1);
+#else // TODO: clean up when RVV Intrinsic is frozen.
+    vfloat64m2_t temp = vlmul_ext_f64m2(a0);
+    temp = vset(temp, 1, a1);
+#endif
+    temp = vrgatherei16(temp, idx, VTraits<v_float64>::vlanes()*2);
+    b0 = vget_f64m1(temp, 0); \
+    b1 = vget_f64m1(temp, 1); \
+}
+#endif
+
 #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
 { \
@@ -1859,12 +1879,14 @@ inline v_int32 v_trunc(const v_float32& a)
 #if CV_SIMD_SCALABLE_64F
 inline v_int32 v_round(const v_float64& a)
 {
-    return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
+    return vfncvt_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
 }
 
 inline v_int32 v_round(const v_float64& a, const v_float64& b)
 {
-    return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
+    // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
+    // Fix https://github.com/opencv/opencv/issues/24746
+    return vfncvt_x(vset(vlmul_ext_f64m2(a), 1, b), VTraits<v_float32>::vlanes());
 }
 
 inline v_int32 v_floor(const v_float64& a)
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index a8c565ec46..08138d194d 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -281,7 +281,7 @@ template<typename R> struct TheTest
         v_uint64 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
         v_int64 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
         v_float32 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         v_float64 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
 #endif
 
@@ -747,7 +747,7 @@ template<typename R> struct TheTest
 
     TheTest & test_dotprod_expand_f64()
     {
-    #if CV_SIMD_64F
+    #if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         Data<R> dataA, dataB;
         dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
         dataB += std::numeric_limits<LaneType>::min();
@@ -1385,6 +1385,33 @@ template<typename R> struct TheTest
 
         return *this;
     }
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    TheTest & test_round_pair_f64()
+    {
+        typedef typename V_RegTraits<R>::round_reg Ri;
+        Data<R> data1, data1_border, data2;
+        // See https://github.com/opencv/opencv/issues/24213
+        // https://github.com/opencv/opencv/issues/24163
+        // https://github.com/opencv/opencv/pull/24271
+        data1_border *= 0.5;
+        data1 *= 1.1;
+        data2 += 10;
+        R a1 = data1, a1_border = data1_border, a2 = data2;
+
+        Data<Ri> resA = v_round(a1, a1),
+                 resB = v_round(a1_border, a1_border),
+                 resC = v_round(a2, a2);
+
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+        {
+            EXPECT_EQ(cvRound(data1[i]), resA[i]);
+            EXPECT_EQ(cvRound(data1_border[i]), resB[i]);
+            EXPECT_EQ(cvRound(data2[i]), resC[i]);
+        }
+
+        return *this;
+    }
+#endif
 
     TheTest & test_float_cvt32()
     {
@@ -1405,7 +1432,7 @@ template<typename R> struct TheTest
 
     TheTest & test_float_cvt64()
     {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         typedef v_float64 Rt;
         Data<R> dataA;
         dataA *= 1.1;
@@ -1431,7 +1458,7 @@ template<typename R> struct TheTest
 
     TheTest & test_cvt64_double()
     {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         Data<R> dataA(std::numeric_limits<LaneType>::max()),
                 dataB(std::numeric_limits<LaneType>::min());
         dataB += VTraits<R>::vlanes();
@@ -1994,7 +2021,7 @@ void test_hal_intrin_float32()
 void test_hal_intrin_float64()
 {
     DUMP_ENTRY(v_float64);
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     TheTest<v_float64>()
         .test_loadstore()
         .test_addsub()
@@ -2008,6 +2035,7 @@ void test_hal_intrin_float64()
         .test_mask()
         .test_unpack()
         .test_float_math()
+        .test_round_pair_f64()
         .test_float_cvt32()
         .test_reverse()
         .test_extract<0>().test_extract<1>()
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index d985a1c2b6..d5622dabb4 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -1574,11 +1574,7 @@ TEST(Core_Arithm, scalar_handling_19599)  // https://github.com/opencv/opencv/is
 typedef tuple<perf::MatDepth,int,int,int> Arith_Regression24163Param;
 typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163;
 
-#if defined __riscv
-TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even)
-#else
 TEST_P(Core_Arith_Regression24163, test_for_ties_to_even)
-#endif
 {
     const int matDepth = get<0>(GetParam());
     const int matHeight= get<1>(GetParam());