From ce0516282a53947933387bb8e4262a0594ce0913 Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Thu, 23 Nov 2023 15:06:04 +0800 Subject: [PATCH] Optimize the v_lut for RVV. --- .../opencv2/core/hal/intrin_rvv_scalable.hpp | 74 +++++++++++++------ 1 file changed, 50 insertions(+), 24 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index c7770334ad..e2475e0e7d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -448,29 +448,7 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ - return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ -} \ -inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \ -{ \ - std::vector idx_; \ - for (int i = 0; i < VTraits::vlanes(); ++i) { \ - idx_.push_back(idx[i]); \ - idx_.push_back(idx[i]+1); \ - } \ - vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ - return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ -} \ -inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \ -{ \ - std::vector idx_; \ - for (int i = 0; i < VTraits::vlanes(); ++i) { \ - idx_.push_back(idx[i]); \ - idx_.push_back(idx[i]+1); \ - idx_.push_back(idx[i]+2); \ - idx_.push_back(idx[i]+3); \ - } \ - vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + auto vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); /* scale the element indices by sizeof(_Tp) before the indexed load */ \ return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ } OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4) @@ -482,6 +460,55 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2) #endif +#define 
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) /* Defines v_lut_pairs(tab, idx): reads vlanes/2 indices and gathers tab[idx[k]], tab[idx[k]+1] into lanes 2k, 2k+1. suffix1 is the LMUL suffix of the loaded u32 index vector, suffix2 the suffix used when the widened vector is viewed as u32 again, v_trunc is OPENCV_HAL_NOP or vlmul_trunc_u32mf2 in the instantiations below. */ \ +inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \ +{ \ + auto v0 = vle32_v_u32##suffix1((const unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); /* cast keeps idx const-qualified */ \ + auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/2); \ + auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/2); /* widen so that each index spans two u32 lanes */ \ + auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/2); \ + auto sh1 = vslide1up(v_trunc(vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); /* shift the idx+1 values up by one u32 lane, inserting 0 at lane 0 */ \ + auto vid = vor(sh1, v_trunc(vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); /* merge: idx, idx+1 alternate */ \ + auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, vlmul_trunc_u32mf2) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, vlmul_trunc_u32mf2) +#endif + + +#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) /* Defines v_lut_quads(tab, idx): reads vlanes/4 indices and gathers tab[idx[k]] .. tab[idx[k]+3] into lanes 4k .. 4k+3, using two rounds of the widen / slide-up / or merge. suffix0, suffix1, suffix2 are the u32 LMUL suffixes at the load, after the first widening and after the second widening. */ \ +inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \ +{ \ + auto v0 = vle32_v_u32##suffix0((const unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); /* cast keeps idx const-qualified */ \ + auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/4); \ + auto v2 = vadd(v0, 2, VTraits<_Tpvec>::vlanes()/4); \ + auto v3 = vadd(v0, 3, VTraits<_Tpvec>::vlanes()/4); \ + auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/4); \ + auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/4); \ + auto w2 = vwcvtu_x(v2, VTraits<_Tpvec>::vlanes()/4); \ + auto w3 = vwcvtu_x(v3, VTraits<_Tpvec>::vlanes()/4); \ + auto sh2 = vslide1up(vreinterpret_u32##suffix1(w2),0, VTraits<_Tpvec>::vlanes()/2); \ + auto sh3 = vslide1up(vreinterpret_u32##suffix1(w3),0, VTraits<_Tpvec>::vlanes()/2); \ + 
auto vid0 = vor(sh2, vreinterpret_u32##suffix1(w0), VTraits<_Tpvec>::vlanes()/2); /* first round: idx, idx+2 alternate */ \ + auto vid1 = vor(sh3, vreinterpret_u32##suffix1(w1), VTraits<_Tpvec>::vlanes()/2); /* first round: idx+1, idx+3 alternate */ \ + auto wid0 = vwcvtu_x(v_trunc(vid0), VTraits<_Tpvec>::vlanes()/2); \ + auto wid1 = vwcvtu_x(v_trunc(vid1), VTraits<_Tpvec>::vlanes()/2); \ + auto shwid1 = vslide1up(vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \ + auto vid = vor(shwid1, vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); /* second round: idx, idx+1, idx+2, idx+3 in order */ \ + auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, vlmul_trunc_u32mf2) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, vlmul_trunc_u32mf2) + #define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \ inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \ { \ @@ -512,7 +539,6 @@ inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_rein inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } -inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); } ////////////// Pack boolean //////////////////// inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b)