Merge pull request #24936 from mshabunin:fix-rvv07-scale64f

RISC-V: fix scale64f performance for RVV 0.7
pull/24927/head
Alexander Smorkalov 10 months ago committed by GitHub
commit 54b7cafd2a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 14
      modules/core/include/opencv2/core/hal/intrin_rvv071.hpp

@@ -490,12 +490,12 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
// Fused multiply-add: returns a * b + c (element-wise, 4 x float32).
// NOTE(review): the scraped diff kept both the removed vfmacc form and the
// added vfmadd form, leaving an unreachable second return; only the intended
// post-change vfmadd form is kept. vfmadd.vv computes vd*vs1 + vs2, so
// vfmadd(a, b, c) == a*b + c — same result as the old vfmacc(c, a, b),
// chosen for better performance on RVV 0.7 hardware.
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));
}
// Integer fused multiply-add: returns a * b + c (element-wise, 4 x int32).
// NOTE(review): the scraped diff kept both the removed vmacc form and the
// added vmadd form, leaving an unreachable second return; only the intended
// post-change vmadd form is kept. vmadd.vv computes vd*vs1 + vs2, so
// vmadd(a, b, c) == a*b + c — same result as the old vmacc(c, a, b).
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
@@ -553,7 +553,7 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
// Fused multiply-add: returns a * b + c (element-wise, 2 x float64).
// NOTE(review): the scraped diff kept both the removed vfmacc form and the
// added vfmadd form, leaving an unreachable second return; only the intended
// post-change vfmadd form is kept. vfmadd.vv computes vd*vs1 + vs2, so
// vfmadd(a, b, c) == a*b + c — same result as the old vfmacc(c, a, b),
// chosen for better scale64f performance on RVV 0.7 hardware.
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));
}
inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
@@ -1429,7 +1429,7 @@ inline _Tpvec v_load_low(const _Tp* ptr) \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -1438,7 +1438,7 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
@@ -1469,7 +1469,7 @@ inline _Tpvec v_load_low(const _Tp* ptr) \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -1478,7 +1478,7 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \

Loading…
Cancel
Save