Merge pull request #25586 from hanliutong:rvv-64f

Fix v_round and enable unit tests for scalable universal intrinsic 64F type. #25586 This may be a legacy issue from the previous PR #24325. I don't quite remember why the float 64 part of the unit test was not enabled at that time. Whatever, this patch enables the unit tests for scalable 64F type , and makes the necessary modifications to the RVV backend to make the tests pass. This patch is compiled by GCC 14 and LLVM 17 &18, and tested on QEMU and k230. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
8 months ago · e52540162f
parent c71d495273
commit e52540162f
4 changed files with 63 additions and 12 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
@ -200,9 +200,14 @@ inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t v
    return vmul_vx_u32mf2(op1, op2, vl);
 }

-inline static vuint32mf2_t vreinterpret_u32mf2(vint32mf2_t val)
+inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
 {
    return vreinterpret_v_i32mf2_u32mf2(val);
 }

+inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
+{
+    return vreinterpret_v_u16mf2_u32mf2(val);
+}
+
 #endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@ -1528,6 +1528,26 @@ OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPEN
 OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
 OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)

+#if CV_SIMD_SCALABLE_64F
+inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) { \
+    vuint16mf4_t idx0 = vid_v_u16mf4(VTraits<v_float64>::vlanes());
+    vuint16mf4_t idx1 = vadd(idx0, VTraits<v_float64>::vlanes(), VTraits<v_float64>::vlanes());
+    vuint16mf2_t idx = vreinterpret_u16mf2(( \
+        vor(vzext_vf2(idx0, VTraits<v_float64>::vlanes()), \
+            vreinterpret_u32mf2(vslide1up(vreinterpret_u16mf2(vzext_vf2(idx1, VTraits<v_float64>::vlanes())), 0, VTraits<v_uint32>::vlanes())), \
+            VTraits<v_uint32>::vlanes())));
+#if 0
+    vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1);
+#else // TODO: clean up when RVV Intrinsic is frozen.
+    vfloat64m2_t temp = vlmul_ext_f64m2(a0);
+    temp = vset(temp, 1, a1);
+#endif
+    temp = vrgatherei16(temp, idx, VTraits<v_float64>::vlanes()*2);
+    b0 = vget_f64m1(temp, 0); \
+    b1 = vget_f64m1(temp, 1); \
+}
+#endif
+
 #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
 { \
@ -1859,12 +1879,14 @@ inline v_int32 v_trunc(const v_float32& a)
 #if CV_SIMD_SCALABLE_64F
 inline v_int32 v_round(const v_float64& a)
 {
-    return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
+    return vfncvt_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
 }

 inline v_int32 v_round(const v_float64& a, const v_float64& b)
 {
-    return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
+    // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
+    // Fix https://github.com/opencv/opencv/issues/24746
+    return vfncvt_x(vset(vlmul_ext_f64m2(a), 1, b), VTraits<v_float32>::vlanes());
 }

 inline v_int32 v_floor(const v_float64& a)
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -281,7 +281,7 @@ template<typename R> struct TheTest
        v_uint64 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
        v_int64 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
        v_float32 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
        v_float64 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
 #endif

@ -747,7 +747,7 @@ template<typename R> struct TheTest

    TheTest & test_dotprod_expand_f64()
    {
-    #if CV_SIMD_64F
+    #if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
        Data<R> dataA, dataB;
        dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
        dataB += std::numeric_limits<LaneType>::min();
@ -1385,6 +1385,33 @@ template<typename R> struct TheTest

        return *this;
    }
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    TheTest & test_round_pair_f64()
+    {
+        typedef typename V_RegTraits<R>::round_reg Ri;
+        Data<R> data1, data1_border, data2;
+        // See https://github.com/opencv/opencv/issues/24213
+        // https://github.com/opencv/opencv/issues/24163
+        // https://github.com/opencv/opencv/pull/24271
+        data1_border *= 0.5;
+        data1 *= 1.1;
+        data2 += 10;
+        R a1 = data1, a1_border = data1_border, a2 = data2;
+
+        Data<Ri> resA = v_round(a1, a1),
+                 resB = v_round(a1_border, a1_border),
+                 resC = v_round(a2, a2);
+
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+        {
+            EXPECT_EQ(cvRound(data1[i]), resA[i]);
+            EXPECT_EQ(cvRound(data1_border[i]), resB[i]);
+            EXPECT_EQ(cvRound(data2[i]), resC[i]);
+        }
+
+        return *this;
+    }
+#endif

    TheTest & test_float_cvt32()
    {
@ -1405,7 +1432,7 @@ template<typename R> struct TheTest

    TheTest & test_float_cvt64()
    {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
        typedef v_float64 Rt;
        Data<R> dataA;
        dataA *= 1.1;
@ -1431,7 +1458,7 @@ template<typename R> struct TheTest

    TheTest & test_cvt64_double()
    {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
        Data<R> dataA(std::numeric_limits<LaneType>::max()),
                dataB(std::numeric_limits<LaneType>::min());
        dataB += VTraits<R>::vlanes();
@ -1994,7 +2021,7 @@ void test_hal_intrin_float32()
 void test_hal_intrin_float64()
 {
    DUMP_ENTRY(v_float64);
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    TheTest<v_float64>()
        .test_loadstore()
        .test_addsub()
@ -2008,6 +2035,7 @@ void test_hal_intrin_float64()
        .test_mask()
        .test_unpack()
        .test_float_math()
+        .test_round_pair_f64()
        .test_float_cvt32()
        .test_reverse()
        .test_extract<0>().test_extract<1>()
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@ -1574,11 +1574,7 @@ TEST(Core_Arithm, scalar_handling_19599)  // https://github.com/opencv/opencv/is
 typedef tuple<perf::MatDepth,int,int,int> Arith_Regression24163Param;
 typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163;

-#if defined __riscv
-TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even)
-#else
 TEST_P(Core_Arith_Regression24163, test_for_ties_to_even)
-#endif
 {
    const int matDepth = get<0>(GetParam());
    const int matHeight= get<1>(GetParam());