From 224b9ee33f9c570f3b411bd669aeec36ac751508 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin
Date: Wed, 10 Jan 2024 12:53:33 +0300
Subject: [PATCH] RISC-V: updated intrin_rvv071.hpp to work with modern
 toolchain 2.8.0

- intrinsics implementation (071) reworked to use modern RVV intrinsics syntax
- cmake toolchain file (071) now allows selecting from predefined configurations

Co-authored-by: Fang Sun
---
 .../include/opencv2/core/cv_cpu_dispatch.h |    2 +-
 .../opencv2/core/hal/intrin_rvv071.hpp     | 1596 ++++++++++-------
 .../linux/riscv64-071-gcc.toolchain.cmake  |   53 +-
 3 files changed, 1033 insertions(+), 618 deletions(-)

diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index de7b84b82a..8269fa6121 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -147,7 +147,7 @@
 #endif
 #if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071)
-# include
+# include
 # define CV_RVV071 1
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
index 26f478feda..ef5f0d0ed9 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -19,7 +19,7 @@ namespace cv
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128 1
-#define CV_SIMD128_64F 0
+#define CV_SIMD128_64F 1
 //////////// Types ////////////
 struct v_uint8x16
 {
@@ -32,11 +32,11 @@ struct v_uint8x16
                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
     {
         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
-        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
+        val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16);
     }
     uchar get0() const
     {
-        return vmv_x_s_u8m1_u8(val, 16);
+        return vmv_x_s_u8m1_u8(val);
     }
     vuint8m1_t val;
@@ -53,11 +53,11 @@ struct v_int8x16
                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
     {
         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
-        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
+        val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16);
     }
     schar get0() const
     {
-        return vmv_x_s_i8m1_i8(val, 16);
+        return vmv_x_s_i8m1_i8(val);
     }
     vint8m1_t val;
@@ -73,11 +73,11 @@ struct v_uint16x8
     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
     {
         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
-        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
+        val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8);
     }
     ushort get0() const
     {
-        return vmv_x_s_u16m1_u16(val, 8);
+        return vmv_x_s_u16m1_u16(val);
     }
     vuint16m1_t val;
@@ -93,11 +93,11 @@ struct v_int16x8
     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
     {
         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
-        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
+        val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8);
     }
     short get0() const
     {
-        return vmv_x_s_i16m1_i16(val, 8);
+        return vmv_x_s_i16m1_i16(val);
     }
     vint16m1_t val;
@@ -113,11 +113,11 @@ struct v_uint32x4
     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
     {
         unsigned v[] = {v0, v1, v2, v3};
-        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
+        val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4);
     }
     unsigned get0() const
     {
-        return vmv_x_s_u32m1_u32(val, 4);
+        return vmv_x_s_u32m1_u32(val);
     }
     vuint32m1_t val;
@@ -133,11
+133,11 @@ struct v_int32x4 v_int32x4(int v0, int v1, int v2, int v3) { int v[] = {v0, v1, v2, v3}; - val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4); + val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4); } int get0() const { - return vmv_x_s_i32m1_i32(val, 4); + return vmv_x_s_i32m1_i32(val); } vint32m1_t val; }; @@ -152,11 +152,11 @@ struct v_float32x4 v_float32x4(float v0, float v1, float v2, float v3) { float v[] = {v0, v1, v2, v3}; - val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4); + val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4); } float get0() const { - return vfmv_f_s_f32m1_f32(val, 4); + return vfmv_f_s_f32m1_f32(val); } vfloat32m1_t val; }; @@ -171,11 +171,11 @@ struct v_uint64x2 v_uint64x2(uint64 v0, uint64 v1) { uint64 v[] = {v0, v1}; - val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2); + val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2); } uint64 get0() const { - return vmv_x_s_u64m1_u64(val, 2); + return vmv_x_s_u64m1_u64(val); } vuint64m1_t val; }; @@ -190,11 +190,11 @@ struct v_int64x2 v_int64x2(int64 v0, int64 v1) { int64 v[] = {v0, v1}; - val = (vint64m1_t)vle_v_i64m1((long*)v, 2); + val = (vint64m1_t)vle64_v_i64m1((long*)v, 2); } int64 get0() const { - return vmv_x_s_i64m1_i64(val, 2); + return vmv_x_s_i64m1_i64(val); } vint64m1_t val; }; @@ -209,21 +209,21 @@ struct v_float64x2 v_float64x2(double v0, double v1) { double v[] = {v0, v1}; - val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2); + val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2); } double get0() const { - return vfmv_f_s_f64m1_f64(val, 2); + return vfmv_f_s_f64m1_f64(val); } vfloat64m1_t val; }; - +/* #define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \ -inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \ +inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \ inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \ -inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \ inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \ inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \ inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \ @@ -233,17 +233,128 @@ inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2( OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8) -OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8) +OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8) OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16) -OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16) +OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, i16) OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32) -OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32) +OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32) OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64) -OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64) +OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64) OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64) OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32) +*/ +inline v_uint8x16 v_reinterpret_as_u8(const 
v_uint8x16& v) { return v_uint8x16(v.val); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); } +inline v_float32x4 
v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); 
} +inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); } +inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& 
v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); } +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); } +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); } + #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \ -inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \ +inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \ inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16) @@ -254,7 +365,7 @@ OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2) OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2) -inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); } +inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); } inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); } @@ -297,8 +408,8 @@ OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4) 
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4) @@ -401,10 +512,10 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); return v_float32x4(res); } @@ -412,9 +523,9 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1); return v_float32x4(res); } @@ -471,11 +582,11 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2) #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \ inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ { \ - return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \ + return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \ } \ inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ { \ - a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \ + a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \ return a; \ } @@ -485,17 +596,17 @@ OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1) inline v_float32x4 operator ~ (const v_float32x4& a) { - return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4))); + return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4))); } #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \ inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ { \ - 
return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \ + return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \ } \ inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ { \ - a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \ + a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \ return a; \ } @@ -505,7 +616,7 @@ OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1) inline v_float64x2 operator ~ (const v_float64x2& a) { - return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2))); + return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2))); } inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { @@ -527,19 +638,19 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) inline v_uint32x4 v_abs(v_int32x4 x) { vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4); - return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)); + return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4))); } inline v_uint16x8 v_abs(v_int16x8 x) { vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8); - return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)); + return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8))); } inline v_uint8x16 v_abs(v_int8x16 x) { vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16); - return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)); + return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16))); } inline v_float32x4 v_abs(v_float32x4 x) @@ -591,7 +702,7 @@ inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){ inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \ vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\ vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\ - return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \ + return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \ } OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16) @@ -604,8 +715,8 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, { vint16m2_t res = vundefined_i16m2(); res = vwmul_vv_i16m2(a.val, b.val, 16); - c.val = vget_i16m2_i16m1(res, 0); - d.val = vget_i16m2_i16m1(res, 1); + c.val = vget_v_i16m2_i16m1(res, 0); + d.val = vget_v_i16m2_i16m1(res, 1); } inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, @@ -613,8 +724,8 @@ inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, { vuint16m2_t res = vundefined_u16m2(); res = vwmulu_vv_u16m2(a.val, b.val, 16); - c.val = vget_u16m2_u16m1(res, 0); - d.val = vget_u16m2_u16m1(res, 1); + c.val = vget_v_u16m2_u16m1(res, 0); + d.val = vget_v_u16m2_u16m1(res, 1); } inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, @@ -622,8 +733,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, { vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - c.val = vget_i32m2_i32m1(res, 0); - d.val = vget_i32m2_i32m1(res, 1); + c.val = vget_v_i32m2_i32m1(res, 0); + d.val = vget_v_i32m2_i32m1(res, 1); } inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, @@ -631,8 +742,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, { vuint32m2_t res = vundefined_u32m2(); res = 
vwmulu_vv_u32m2(a.val, b.val, 8); - c.val = vget_u32m2_u32m1(res, 0); - d.val = vget_u32m2_u32m1(res, 1); + c.val = vget_v_u32m2_u32m1(res, 0); + d.val = vget_v_u32m2_u32m1(res, 1); } inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, @@ -640,8 +751,8 @@ inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, { vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - c.val = vget_i64m2_i64m1(res, 0); - d.val = vget_i64m2_i64m1(res, 1); + c.val = vget_v_i64m2_i64m1(res, 0); + d.val = vget_v_i64m2_i64m1(res, 1); } inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, @@ -649,8 +760,8 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, { vuint64m2_t res = vundefined_u64m2(); res = vwmulu_vv_u64m2(a.val, b.val, 4); - c.val = vget_u64m2_u64m1(res, 0); - d.val = vget_u64m2_u64m1(res, 1); + c.val = vget_v_u64m2_u64m1(res, 0); + d.val = vget_v_u64m2_u64m1(res, 1); } OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) @@ -669,118 +780,202 @@ OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8) // 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0),vget_v_i32m2_i32m1(res, 1), 4), c.val, 4)); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2)); + res = vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + 
vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2)); + res = vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2)); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v1 = vrgather_vv_i16m2(v1, 
(vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4)); + v1 = vrgather_vv_i16m2(v1, vindex, 16); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4)); + v1 = vrgather_vv_i16m2(v1, vindex, 16); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2)); + v1 = vrgather_vv_u32m2(v1, vindex, 8); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = 
vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2)); + v1 = vrgather_vv_u32m2(v1, vindex, 8); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2)); + v1 = vrgather_vv_i32m2(v1, vindex, 8); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2)); + v1 = vrgather_vv_i32m2(v1, vindex, 8); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2)); } //////// Fast Dot Product //////// @@ -789,14 +984,14 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) { vint32m2_t v1 = vundefined_i32m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4)); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4)); } inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { vint32m2_t v1 = vundefined_i32m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4)); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4)); } // 32 >> 64 @@ -804,13 +999,13 @@ inline v_int64x2 
v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vundefined_i64m2(); v1 = vwmul_vv_i64m2(a.val, b.val, 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2)); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2)); } inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { vint64m2_t v1 = vundefined_i64m2(); v1 = vwmul_vv_i64m2(a.val, b.val, 8); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 4), c.val, 4)); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 4), c.val, 4)); } // 8 >> 32 @@ -819,8 +1014,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4)); } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) @@ -828,8 +1023,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4)); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) @@ -837,16 +1032,16 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4)); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4)); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4)); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4)); } // 16 >> 64 @@ -855,16 +1050,16 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return 
v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2)); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2)); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) { vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2)); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2)); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) @@ -872,16 +1067,16 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2)); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2)); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) { vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2)); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2)); } @@ -890,16 +1085,16 @@ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \ {\ v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \ val = intrin(val, a.val, val, num); \ - return vmv_x_s_##len##m1_##len(val, num); \ + return vmv_x_s_##len##m1_##len(val); \ } -#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \ +#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \ {\ - v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \ + v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \ val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \ - return val[0]; \ + return scalerfunc(val); \ } OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8) @@ -910,30 +1105,30 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu inline float v_reduce_sum(const v_float32x4& a) \ {\ vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \ - val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4); \ - return vfmv_f_s_f32m1_f32(val, 4); \ + val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4); \ + return vfmv_f_s_f32m1_f32(val); \ } inline double v_reduce_sum(const v_float64x2& a) \ {\ vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \ - val = 
vfredsum_vs_f64m1_f64m1(val, a.val, val, 2); \ - return vfmv_f_s_f64m1_f64(val, 2); \ + val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2); \ + return vfmv_f_s_f64m1_f64(val); \ } inline uint64 v_reduce_sum(const v_uint64x2& a) -{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); } +{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); } inline int64 v_reduce_sum(const v_int64x2& a) -{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); } +{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); } #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4) +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min) @@ -944,11 +1139,15 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4); vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4); vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4); - a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4); - b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4); - c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4); - d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4); - return v_float32x4(a0[0], b0[0], c0[0], d0[0]); + a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4); + b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4); + c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4); + d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4); + vfloat32m1_t res; + res = vslideup_vx_f32m1(a0, b0, 1, 4); + res = vslideup_vx_f32m1(res, c0, 2, 4); + res = vslideup_vx_f32m1(res, d0, 3, 4); + return v_float32x4(res); } inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) @@ -957,8 +1156,8 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4); vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4); vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4); - a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4); - return a0[0]; + a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4); + return 
vfmv_f_s_f32m1_f32(a0); } #define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \ @@ -1020,43 +1219,43 @@ inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); -} + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); +}/**/ inline v_float32x4 v_not_nan(const v_float32x4& a) { - vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4); + vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } //TODO: == @@ -1064,43 +1263,43 @@ inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator > (const v_float64x2& a, const 
v_float64x2& b) { vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); -} + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); +}/**/ inline v_float64x2 v_not_nan(const v_float64x2& a) { - vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2); + vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } #define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \ @@ -1108,16 +1307,23 @@ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \ v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \ v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \ { \ + vuint32m4_t vindex = vundefined_u32m4(); \ + vuint32m1_t vindex0 = vid_v_u32m1(4); \ + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \ + vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \ + vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \ + vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \ + vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \ v##_Tp##32m4_t val = vundefined_##_T##m4(); \ - val = vset_##_T##m4(val, 0, a0.val); \ - val = vset_##_T##m4(val, 1, a1.val); \ - val = vset_##_T##m4(val, 2, a2.val); \ - val = vset_##_T##m4(val, 3, a3.val); \ - val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \ - b0.val = vget_##_T##m4_##_T##m1(val, 0); \ - b1.val = vget_##_T##m4_##_T##m1(val, 1); \ - b2.val = vget_##_T##m4_##_T##m1(val, 2); \ - b3.val = vget_##_T##m4_##_T##m1(val, 3); \ + val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \ + val = vrgather_vv_##_T##m4(val, vindex, 16); \ + b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \ + b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \ + b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \ + b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \ } OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32) OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32) @@ -1167,25 +1373,28 @@ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ } \ template inline _Tpvec v_rotate_right(const _Tpvec& a) \ { \ - return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\ + suffix##m1_t res = vundefined_##_T##m1(); \ + return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ { \ suffix##m2_t tmp = vundefined_##_T##m2(); \ - tmp = vset_##_T##m2(tmp, 0, a.val); \ - tmp = vset_##_T##m2(tmp, 1, b.val); \ - tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\ - return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\ + suffix##m2_t res = vundefined_##_T##m2(); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \ + res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\ + return 
_Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\ } \ template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ { \ suffix##m2_t tmp = vundefined_##_T##m2(); \ - tmp = vset_##_T##m2(tmp, 0, b.val); \ - tmp = vset_##_T##m2(tmp, 1, a.val); \ - tmp = vslideup_vx_##_T##m2(tmp, n, num2);\ - return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\ + suffix##m2_t res = vundefined_##_T##m2(); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \ + res = vslideup_vx_##_T##m2(res, tmp, n, num2);\ + return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1203,50 +1412,132 @@ OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64) OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32) OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64) -#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \ +#if 1 +#define vreinterpret_v_i8m1_i8m1 +#define vreinterpret_v_u8m1_u8m1 +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { \ - typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \ - vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\ - return _Tpvec(_Tp2##_t(tmp)); } \ + _Tp2##_t res = vundefined_##len(); \ + _Tp2##_t res1 = vundefined_##len(); \ + res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \ + res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ inline _Tpvec v_load_low(const _Tp* ptr) \ -{ return _Tpvec(vle_v_##len(ptr, hnum)); }\ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\ inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(vle_v_##len(ptr, num)); } \ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \ inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, hnum);}\ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \ - vse_v_##len(ptr, a0, hnum);}\ + _Tp2##_t a0 = vundefined_##len(); \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\ inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ -{ vse_v_##len(ptr, a.val, num); } +{ vse8_v_##ldst_len((ldst_type *)ptr, 
vreinterpret_v_##len##_##ldst_len(a.val), 16); } + +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar) + +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + _Tp2##_t res = vundefined_##len(); \ + _Tp2##_t res1 = vundefined_##len(); \ + res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \ + res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp2##_t a0 = vundefined_##len(); \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, 
vint16m1, i16m1, 4, 8) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2) +#else + +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + _Tp2##_t res, res1; \ + res = vle##elemsize##_v_##len(ptr0, hnum); \ + res1 = vle##elemsize##_v_##len(ptr1, hnum); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp2##_t a0; \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse##elemsize##_v_##len(ptr, a0, hnum);}\ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } + +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64) +#endif ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) { -#if 1 +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[ 0]], @@ -1266,16 +1557,18 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx) tab[idx[14]], tab[idx[15]] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); #else - int32xm4_t index32 = vlev_int32xm4(idx, 16); - vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16); - vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16); - return v_int8x16(vlxbv_i8m1(tab, index, 16)); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16)); +#else + 
return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16)); +#endif #endif } inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[0]], @@ -1295,10 +1588,24 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ tab[idx[7]], tab[idx[7] + 1] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); +#else + vuint32m4_t seq, index; + vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8); + seq = vid_v_u32m4(16); + index = vsrl_vx_u32m4(seq, 1, 16); + vidx = vrgather_vv_u32m4(vidx, index, 16); + index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, index, 16)); +#endif +#endif } inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) { +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[0]], @@ -1318,7 +1625,23 @@ inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) tab[idx[3] + 2], tab[idx[3] + 3] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); +#else + vuint32m4_t seq, index; + vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4); + seq = vid_v_u32m4(16); + index = vsrl_vx_u32m4(seq, 2, 16); + vidx = vrgather_vv_u32m4(vidx, index, 16); + seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0)); + seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0)); + seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0)); + index = vadd_vv_u32m4(seq, vidx, 16); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, index, 16)); +#endif +#endif } inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } @@ -1327,6 +1650,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint inline v_int16x8 v_lut(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1338,10 +1662,18 @@ inline v_int16x8 v_lut(const short* tab, const int* idx) tab[idx[6]], tab[idx[7]] }; - return v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8)); +#endif +#endif } inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1353,10 +1685,24 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) tab[idx[3]], tab[idx[3] + 1] }; - return v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else + vuint32m2_t seq, index; + vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4); + seq = vid_v_u32m2(8); + index = vsrl_vx_u32m2(seq, 1, 8); + vidx = vrgather_vv_u32m2(vidx, index, 8); + index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8); +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, index, 8)); +#endif +#endif } inline v_int16x8 v_lut_quads(const short* 
tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1368,7 +1714,21 @@ inline v_int16x8 v_lut_quads(const short* tab, const int* idx) tab[idx[1] + 2], tab[idx[1] + 3] }; - return v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else + vuint32m2_t seq, index; + vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2); + seq = vid_v_u32m2(8); + index = vsrl_vx_u32m2(seq, 2, 8); + vidx = vrgather_vv_u32m2(vidx, index, 8); + seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0)); + index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8); +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, index, 8)); +#endif +#endif } inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } @@ -1376,6 +1736,7 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein inline v_int32x4 v_lut(const int* tab, const int* idx) { +#if 0 int CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1383,10 +1744,14 @@ inline v_int32x4 v_lut(const int* tab, const int* idx) tab[idx[2]], tab[idx[3]] }; - return v_int32x4(vle_v_i32m1(elems, 4)); + return v_int32x4(vle32_v_i32m1(elems, 4)); +#else + return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4)); +#endif } inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) { +#if 0 int CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1394,11 +1759,20 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) tab[idx[1]], tab[idx[1] + 1] }; - return v_int32x4(vle_v_i32m1(elems, 4)); + return v_int32x4(vle32_v_i32m1(elems, 4)); +#else + vuint32m1_t seq, index; + vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); + seq = vid_v_u32m1(4); + index = vsrl_vx_u32m1(seq, 1, 4); + vidx = vrgather_vv_u32m1(vidx, index, 4); + index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); + return v_int32x4(vloxei32_v_i32m1(tab, index, 4)); +#endif } inline v_int32x4 v_lut_quads(const int* tab, const int* idx) { - return v_int32x4(vle_v_i32m1(tab+idx[0], 4)); + return v_int32x4(vle32_v_i32m1(tab+idx[0], 4)); } inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } @@ -1406,26 +1780,27 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re inline v_int64x2 v_lut(const int64_t* tab, const int* idx) { - vint64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_int64x2(res); + //vint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) { - return v_int64x2(vle_v_i64m1(tab+idx[0], 2)); + return v_int64x2(vle64_v_i64m1(tab+idx[0], 2)); } inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { - vuint64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_uint64x2(res); + //vuint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_uint64x2(vloxei64_v_u64m1(tab, 
vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { - return v_uint64x2(vle_v_u64m1(tab+idx[0], 2)); + return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2)); } inline v_float32x4 v_lut(const float* tab, const int* idx) { +#if 0 float CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1433,10 +1808,14 @@ inline v_float32x4 v_lut(const float* tab, const int* idx) tab[idx[2]], tab[idx[3]] }; - return v_float32x4(vle_v_f32m1(elems, 4)); + return v_float32x4(vle32_v_f32m1(elems, 4)); +#else + return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4)); +#endif } inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { +#if 0 float CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1444,69 +1823,79 @@ inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) tab[idx[1]], tab[idx[1]+1] }; - return v_float32x4(vle_v_f32m1(elems, 4)); + return v_float32x4(vle32_v_f32m1(elems, 4)); +#else + vuint32m1_t seq, index; + vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); + seq = vid_v_u32m1(4); + index = vsrl_vx_u32m1(seq, 1, 4); + vidx = vrgather_vv_u32m1(vidx, index, 4); + index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); + return v_float32x4(vloxei32_v_f32m1(tab, index, 4)); +#endif } inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { - return v_float32x4(vle_v_f32m1(tab + idx[0], 4)); + return v_float32x4(vle32_v_f32m1(tab + idx[0], 4)); } inline v_float64x2 v_lut(const double* tab, const int* idx) { - vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_float64x2(res); + //vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { - return v_float64x2(vle_v_f64m1(tab+idx[0], 2)); + return v_float64x2(vle64_v_f64m1(tab+idx[0], 2)); } inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) { - int CV_DECL_ALIGNED(32) elems[4] = + /*int CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_int32x4(vle_v_i32m1(elems, 4)); + };*/ + return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) { - unsigned CV_DECL_ALIGNED(32) elems[4] = + /*unsigned CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_uint32x4(vle_v_u32m1(elems, 4)); + };*/ + return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) { - float CV_DECL_ALIGNED(32) elems[4] = + /*float CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_float32x4(vle_v_f32m1(elems, 4)); + };*/ + return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) { - vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]}; - return v_float64x2(res); + //vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]}; + return v_float64x2(vloxei64_v_f64m1(tab, 
vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2)); } inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) { - vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4); - vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4); + vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4); + //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4); - x.val = vlxe_v_f32m1(tab, index_x, 4); - y.val = vlxe_v_f32m1(tab, index_y, 4); + //x.val = vlxe_v_f32m1(tab, index_x, 4); + //y.val = vlxe_v_f32m1(tab, index_y, 4); + vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4); } inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) @@ -1518,52 +1907,52 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); } -#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \ +#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \ inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, b.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \ return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \ }\ template inline \ v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, b.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \ return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \ }\ inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ asm("" ::: "memory"); \ - vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \ + vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \ }\ template inline \ void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ - vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ + vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \ } -OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char) -OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short) -OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int) 
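The instantiations below switch the narrowing intrinsics from the vn*_vx spellings to the vn*_wx spellings and add the explicit store element size. As a minimal sketch of what the new expansion looks like for the int16 -> int8 case (pack_i16x8_pair is an illustrative helper name, not part of the patch; it assumes a <riscv_vector.h> that still exposes the unprefixed intrinsic names used throughout this header):

    #include <riscv_vector.h>

    // Roughly what the updated OPENCV_HAL_IMPL_RISCVV_PACKS expansion does for
    // v_pack(v_int16x8, v_int16x8): gather the two halves into an m2 register
    // group and narrow with saturation. Old names shown in trailing comments.
    inline vint8m1_t pack_i16x8_pair(vint16m1_t a, vint16m1_t b)
    {
        vint16m2_t tmp = vundefined_i16m2();
        tmp = vset_v_i16m1_i16m2(tmp, 0, a);   // was: vset_i16m2(tmp, 0, a)
        tmp = vset_v_i16m1_i16m2(tmp, 1, b);   // was: vset_i16m2(tmp, 1, b)
        return vnclip_wx_i8m1(tmp, 0, 16);     // was: vnclip_vx_i8m1(tmp, 0, 16)
    }
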
+OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8) +OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16) +OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32) // pack boolean inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) { vuint16m2_t tmp = vundefined_u16m2(); \ - tmp = vset_u16m2(tmp, 0, a.val); \ - tmp = vset_u16m2(tmp, 1, b.val); \ - return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16)); + tmp = vset_v_u16m1_u16m2(tmp, 0, a.val); \ + tmp = vset_v_u16m1_u16m2(tmp, 1, b.val); \ + return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16)); } inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, @@ -1571,12 +1960,12 @@ inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, { vuint32m4_t vabcd = vundefined_u32m4(); \ vuint16m2_t v16 = vundefined_u16m2(); \ - vabcd = vset_u32m4(vabcd, 0, a.val); \ - vabcd = vset_u32m4(vabcd, 1, b.val); \ - vabcd = vset_u32m4(vabcd, 2, c.val); \ - vabcd = vset_u32m4(vabcd, 3, d.val); \ - v16 = vnsrl_vx_u16m2(vabcd, 0, 16); - return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); + vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val); \ + v16 = vnsrl_wx_u16m2(vabcd, 0, 16); + return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16)); } inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, @@ -1586,17 +1975,17 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin vuint64m8_t v64 = vundefined_u64m8(); \ vuint32m4_t v32 = vundefined_u32m4(); \ vuint16m2_t v16 = vundefined_u16m2(); \ - v64 = vset_u64m8(v64, 0, a.val); \ - v64 = vset_u64m8(v64, 1, b.val); \ - v64 = vset_u64m8(v64, 2, c.val); \ - v64 = vset_u64m8(v64, 3, d.val); \ - v64 = vset_u64m8(v64, 4, e.val); \ - v64 = vset_u64m8(v64, 5, f.val); \ - v64 = vset_u64m8(v64, 6, g.val); \ - v64 = vset_u64m8(v64, 7, h.val); \ - v32 = vnsrl_vx_u32m4(v64, 0, 16); - v16 = vnsrl_vx_u16m2(v32, 0, 16); - return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); + v64 = vset_v_u64m1_u64m8(v64, 0, a.val); \ + v64 = vset_v_u64m1_u64m8(v64, 1, b.val); \ + v64 = vset_v_u64m1_u64m8(v64, 2, c.val); \ + v64 = vset_v_u64m1_u64m8(v64, 3, d.val); \ + v64 = vset_v_u64m1_u64m8(v64, 4, e.val); \ + v64 = vset_v_u64m1_u64m8(v64, 5, f.val); \ + v64 = vset_v_u64m1_u64m8(v64, 6, g.val); \ + v64 = vset_v_u64m1_u64m8(v64, 7, h.val); \ + v32 = vnsrl_wx_u32m4(v64, 0, 16); + v16 = vnsrl_wx_u16m2(v32, 0, 16); + return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16)); } //inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \ @@ -1612,35 +2001,35 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ - tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \ vint##tp2##m2_t val = 
vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \ + return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \ } \ inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \ + return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \ } \ template inline \ v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ - tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \ + return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \ } \ template inline \ void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \ - return vse_v_u##tp1##m1(ptr, val, num2);\ + vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \ + return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\ } OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char ) OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short) @@ -1690,8 +2079,12 @@ static const signed char popCountTable[256] = }; inline vuint8m1_t vcnt_u8(vuint8m1_t val){ - vuint8m1_t v0 = val & 1; - return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0; +#if __riscv_v == 7000 + vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16); + return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16); +#else + return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16); +#endif } inline v_uint8x16 @@ -1703,156 +2096,138 @@ v_popcount(const v_uint8x16& a) inline v_uint8x16 v_popcount(const v_int8x16& a) { - return v_uint8x16(vcnt_u8((vuint8m1_t)a.val)); + return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val))); } inline v_uint16x8 v_popcount(const v_uint16x8& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); - return v_uint16x8(vget_u16m2_u16m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val)); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); } inline v_uint16x8 v_popcount(const v_int16x8& a) { - 
vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); - return v_uint16x8(vget_u16m2_u16m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val)); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint32x4 v_popcount(const v_int32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 = zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val)); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(); + vuint16m1_t res2 = vundefined_u16m1(); + res1 = 
vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } inline v_uint64x2 v_popcount(const v_int64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 = zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val))); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1(); + res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } #define SMASK 1, 2, 4, 8, 16, 32, 64, 128 inline int v_signmask(const v_uint8x16& a) { + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16); - vuint32m1_t res = vmv_v_x_u32m1(0, 4); - vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8); - res = vredsum_vs_u32m2_u32m1(res, t2, res, 8); - res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8); - return vmv_x_s_u32m1_u32(res, 8); + vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int8x16& a) { - vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16); - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8); - res = vredsum_vs_i32m2_i32m1(res, t2, res, 8); - res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8); - return vmv_x_s_i32m1_i32(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); + vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int16x8& a) { - vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); - vint16m1_t m1 = (vint16m1_t){SMASK}; - vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); - vint16m1_t res = vmv_v_x_i16m1(0, 8); - res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); - return vmv_x_s_i16m1_i16(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint16m1_t id = vid_v_u16m1(8); + vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); + vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8); + 
res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
+    return vmv_x_s_u16m1_u16(res);
}
inline int v_signmask(const v_uint16x8& a)
{
-    vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
-    vint16m1_t m1 = (vint16m1_t){SMASK};
-    vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
-    vint16m1_t res = vmv_v_x_i16m1(0, 8);
-    res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
-    return vmv_x_s_i16m1_i16(res, 8);
+    vuint16m1_t res = vundefined_u16m1();
+    vuint16m1_t id = vid_v_u16m1(8);
+    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
+    vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8);
+    vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8);
+    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
+    return vmv_x_s_u16m1_u16(res);
}
inline int v_signmask(const v_int32x4& a)
{
-    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
-    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
-    vint32m1_t res = vmv_v_x_i32m1(0, 4);
-    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
-    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
-    return vmv_x_s_i32m1_i32(res, 4);
+    vuint32m1_t res = vundefined_u32m1();
+    vuint32m1_t id = vid_v_u32m1(4);
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
+    vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4);
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
+    return vmv_x_s_u32m1_u32(res);
}
inline int v_signmask(const v_uint32x4& a)
{
-    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
-    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
-    vint32m1_t res = vmv_v_x_i32m1(0, 4);
-    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
-    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
-    return vmv_x_s_i32m1_i32(res, 4);
+    vuint32m1_t res = vundefined_u32m1();
+    vuint32m1_t id = vid_v_u32m1(4);
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
+    vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4);
+    vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4);
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
+    return vmv_x_s_u32m1_u32(res);
}
inline int v_signmask(const v_uint64x2& a)
{
-    vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2);
-    int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1);
-    return res;
+    vuint64m1_t res = vundefined_u64m1();
+    vuint64m1_t id = vid_v_u64m1(2);
+    vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2);
+    vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2);
+    vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2);
+    res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2);
+    return vmv_x_s_u64m1_u64(res);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
@@ -1860,12 +2235,14 @@ inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float32x4& a)
{
-    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
-    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
-    vint32m1_t res = vmv_v_x_i32m1(0, 4);
-    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
-    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
-    return vmv_x_s_i32m1_i32(res, 4);
+    return v_signmask(v_reinterpret_as_u32(a));
+    /*
+    vuint32m1_t res;
+    vuint32m1_t id = vid_v_u32m1(4);
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
+    vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4);
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
+    return vmv_x_s_u32m1_u32(res);*/
}
inline int v_scan_forward(const v_int8x16& a)
{
@@ -1905,24 +2282,22 @@
int val
= v_signmask(a); if(val==0) return 0; else return trailingZeros32(val); } -#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \ +#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \ inline bool v_check_all(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \ - vuint32m1_t v1 = vuint32m1_t(v0); \ - return (v1[0] | v1[1] | v1[2] | v1[3]) == 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \ } \ inline bool v_check_any(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \ - vuint32m1_t v1 = vuint32m1_t(v0); \ - return (v1[0] | v1[1] | v1[2] | v1[3]) != 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \ } -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64) inline bool v_check_all(const v_int8x16& a) { return v_check_all(v_reinterpret_as_u8(a)); } @@ -1950,92 +2325,93 @@ inline bool v_check_any(const v_int64x2& a) inline bool v_check_any(const v_float64x2& a) { return v_check_any(v_reinterpret_as_u64(a)); } -#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \ +#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ { \ - return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \ + return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \ } -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32) inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b) { - return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4)); + return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4)); } inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b) { - return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2)); + return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2)); } -#define 
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \ +#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \ inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \ - b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \ + b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \ } \ inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } \ inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \ } \ inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \ { \ - _T2##_t val = vle##_v_##_Tp1(ptr, num2); \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32) inline v_uint32x4 v_load_expand_q(const uchar* ptr) { vuint16m2_t b = vundefined_u16m2(); vuint32m2_t c = vundefined_u32m2(); - vuint8m1_t val = vle_v_u8m1(ptr, 4); \ + vuint8m1_t val = vle8_v_u8m1(ptr, 4); \ b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); \ - c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ - return v_uint32x4(vget_u32m2_u32m1(c, 0)); + c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ + return v_uint32x4(vget_v_u32m2_u32m1(c, 0)); } inline v_int32x4 v_load_expand_q(const schar* ptr) { vint16m2_t b = vundefined_i16m2(); vint32m2_t c = vundefined_i32m2(); - vint8m1_t val = 
vle_v_i8m1(ptr, 4); \ + vint8m1_t val = vle8_v_i8m1(ptr, 4); \ b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); \ - c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ - return v_int32x4(vget_i32m2_i32m1(c, 0)); + c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ + return v_int32x4(vget_v_i32m2_i32m1(c, 0)); } -#define VITL_16 (vuint32m2_t){0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} -#define VITL_8 (vuint32m2_t){0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} -#define VITL_4 (vuint32m2_t){0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} -#define VITL_2 (vuint32m2_t){0, 0, 2, 0, 1, 0, 3, 0} +#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} +#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} +#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} +#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0} -#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \ +#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \ inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ { \ v##_Tp##m2_t tmp = vundefined_##_T##m2();\ - tmp = vset_##_T##m2(tmp, 0, a0.val); \ - tmp = vset_##_T##m2(tmp, 1, a1.val); \ - vuint32m2_t mask = VITL_##num; \ - tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \ - b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \ - b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \ + unsigned mdata[] = VITL_##num; \ + vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \ + tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \ + b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \ + b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \ } \ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ @@ -2044,58 +2420,59 @@ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ } \ inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ - v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ - v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t b1 = vundefined_##_T##m1(); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ + b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ return v_##_Tpvec(b1);\ } \ inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ { \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \ - v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), 
a0, b0, numh, num); \ } -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2) inline v_uint8x16 v_reverse(const v_uint8x16 &a) { - vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16)); + return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_int8x16 v_reverse(const v_int8x16 &a) { - vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16)); + return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_uint16x8 v_reverse(const v_uint16x8 &a) { - vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8)); + return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_int16x8 v_reverse(const v_int16x8 &a) { - vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8)); + return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_uint32x4 v_reverse(const v_uint32x4 &a) { - return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_int32x4 v_reverse(const v_int32x4 &a) { - return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_float32x4 v_reverse(const v_float32x4 &a) @@ -2103,17 +2480,17 @@ inline v_float32x4 v_reverse(const v_float32x4 &a) inline v_uint64x2 v_reverse(const v_uint64x2 &a) { - return v_uint64x2(a.val[1], a.val[0]); + return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_int64x2 v_reverse(const v_int64x2 &a) { - return v_int64x2(a.val[1], a.val[0]); + return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_float64x2 v_reverse(const 
v_float64x2 &a) { - return v_float64x2(a.val[1], a.val[0]); + return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } #define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \ @@ -2132,19 +2509,19 @@ OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2) OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3) -#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \ -template inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; } +#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \ +template inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); } -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64) #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \ template inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); } @@ -2158,10 +2535,24 @@ OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4) + +inline void __builtin_riscv_fsrm(int val) +{ + asm("csrw frm, %0\n\t" + : + :"r"(val)); + return; +} + +inline void barrier1(void *arg) { + __asm__ __volatile__("" : : "r" (arg) : "memory"); +} + inline v_int32x4 v_round(const v_float32x4& a) { __builtin_riscv_fsrm(0); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2170,7 +2561,8 @@ inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_floor(const v_float32x4& a) { __builtin_riscv_fsrm(2); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = 
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
     __builtin_riscv_fsrm(0);
@@ -2180,7 +2572,8 @@ inline v_int32x4 v_floor(const v_float32x4& a)
 inline v_int32x4 v_ceil(const v_float32x4& a)
 {
     __builtin_riscv_fsrm(3);
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
     __builtin_riscv_fsrm(0);
@@ -2190,7 +2583,8 @@ inline v_int32x4 v_ceil(const v_float32x4& a)
 inline v_int32x4 v_trunc(const v_float32x4& a)
 {
     __builtin_riscv_fsrm(1);
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
     __builtin_riscv_fsrm(0);
@@ -2201,10 +2595,11 @@ inline v_int32x4 v_round(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(0);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
     //_val = vset_f64m2(_val, 1, a.val);
-    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
-    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
+    _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
+    barrier1(&_val);
+    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
     __builtin_riscv_fsrm(0);
     return v_int32x4(val);
 }
@@ -2212,9 +2607,10 @@ inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
 {
     __builtin_riscv_fsrm(0);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    _val = vset_f64m2(_val, 1, b.val);
-    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
+    barrier1(&_val);
+    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
     __builtin_riscv_fsrm(0);
     return v_int32x4(val);
 }
@@ -2222,10 +2618,10 @@ inline v_int32x4 v_floor(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(2);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
-
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
     __builtin_riscv_fsrm(0);
@@ -2236,10 +2632,10 @@ inline v_int32x4 v_ceil(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(3);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
-
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
     __builtin_riscv_fsrm(0);
@@ -2250,139 +2646,86 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(1);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
-
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
     __builtin_riscv_fsrm(0);
     return v_int32x4(val);
 }
-#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
+#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
 { \
-    v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\
-    a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
-    b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
+    intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \
 } \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
 { \
-    v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\
-    a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
-    b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
-    c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
+    intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \
 }\
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
 { \
-    v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\
-    a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
-    b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
-    c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
-    d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
+    intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \
 } \
-#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
+#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
-    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
-    ret = vset_##_T##m1x2(ret, 0, a.val); \
-    ret = vset_##_T##m1x2(ret, 1, b.val); \
-    intrin##2e_v_##_T##m1x2(ptr, ret, num); \
+    intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \
 } \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
-    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
-    ret = vset_##_T##m1x3(ret, 0, a.val); \
-    ret = vset_##_T##m1x3(ret, 1, b.val); \
-    ret = vset_##_T##m1x3(ret, 2, c.val); \
-    intrin##3e_v_##_T##m1x3(ptr, ret, num); \
+    intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \
 } \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
 { \
-    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
-    ret = vset_##_T##m1x4(ret, 0, a.val); \
-    ret = vset_##_T##m1x4(ret, 1, b.val); \
-    ret = vset_##_T##m1x4(ret, 2, c.val); \
-    ret = vset_##_T##m1x4(ret, 3, d.val); \
-    intrin##4e_v_##_T##m1x4(ptr, ret, num); \
+    intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \
 }
-#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \
-OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \
-OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \
+OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \
+OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize)
 //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32)
 
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32)
 
-#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
-{ \
-    v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
-    a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
-    b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
-} \
+{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
-{ \
-    v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \
-    a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
-    b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
-    c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
-}\
+{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
-{ \
-    v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \
-    a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
-    b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
-    c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
-    d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
-} \
+{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
-{ \
-    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
-    ret = vset_##_T##m1x2(ret, 0, a.val); \
-    ret = vset_##_T##m1x2(ret, 1, b.val); \
-    vsseg2e_v_##_T##m1x2(ptr, ret, num); \
-} \
+{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
-{ \
-    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
-    ret = vset_##_T##m1x3(ret, 0, a.val); \
-    ret = vset_##_T##m1x3(ret, 1, b.val); \
-    ret = vset_##_T##m1x3(ret, 2, c.val); \
-    vsseg3e_v_##_T##m1x3(ptr, ret, num); \
-} \
+{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
-{ \
-    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
-    ret = vset_##_T##m1x4(ret, 0, a.val); \
-    ret = vset_##_T##m1x4(ret, 1, b.val); \
-    ret = vset_##_T##m1x4(ret, 2, c.val); \
-    ret = vset_##_T##m1x4(ret, 3, d.val); \
-    vsseg4e_v_##_T##m1x4(ptr, ret, num); \
-}
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)
+{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);}
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64)
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64)
 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 {
@@ -2393,17 +2736,17 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
 {
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
     return v_float32x4(aval);
 }
 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
 {
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    _val = vset_f64m2(_val, 1, b.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4);
     return v_float32x4(aval);
 }
@@ -2411,26 +2754,26 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
 {
     vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
     vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 0));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
 }
 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
 {
     vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
     vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 1));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
 }
 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 {
     vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 0));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
 }
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 {
     vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 1));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
 }
 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
@@ -2441,8 +2784,9 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a)
 #endif
 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
 {
-    vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08};
-    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
 }
 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
 {
@@ -2451,8 +2795,9 @@ inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
 {
-    vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08};
-    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
 }
 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
 {
@@ -2461,35 +2806,40 @@ inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
 {
-    vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
-    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
 {
-    vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504};
-    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
 {
-    vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504};
-    return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
 {
-    vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
-    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16)));
 }
 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 {
-    vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
-    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
@@ -2506,7 +2856,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
 {
     vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
-    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2);
+    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2);
     return v_float64x2(res);
 }
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
@@ -2514,21 +2864,37 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
     return res + c;
 }
 #endif
 ////// FP16 support ///////
+#if __riscv_v == 7000
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4);
+    vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4);
     vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
-    return v_float32x4(vget_f32m2_f32m1(v32, 0));
+    return v_float32x4(vget_v_f32m2_f32m1(v32, 0));
 }
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
     vfloat32m2_t v32 = vundefined_f32m2();
-    v32 = vset_f32m2(v32, 0, v.val);
-    vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4);
-    vse_v_f16m1((__fp16*)ptr, hv, 4);
+    v32 = vset_v_f32m1_f32m2(v32, 0, v.val);
+    vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4);
+    vse16_v_f16m1((__fp16*)ptr, hv, 4);
+}
+#else
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4);
+    vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4);
+    return v_float32x4(v32);
 }
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    //vfloat32m2_t v32 = vundefined_f32m2();
+    //v32 = vset_f32m2(v32, 0, v.val);
+    vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4);
+    vse16_v_f16mf2((__fp16*)ptr, hv, 4);
+}
+#endif
 inline void v_cleanup() {}
@@ -2536,5 +2902,5 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 //! @endcond
 
-} // namespace cv
+}
 
 #endif
diff --git a/platforms/linux/riscv64-071-gcc.toolchain.cmake b/platforms/linux/riscv64-071-gcc.toolchain.cmake
index 53e4a7fced..0542006570 100644
--- a/platforms/linux/riscv64-071-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-071-gcc.toolchain.cmake
@@ -4,5 +4,54 @@ set(CMAKE_SYSTEM_PROCESSOR riscv64)
 set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++)
 set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc)
 
-set(CMAKE_CXX_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071")
-set(CMAKE_C_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071")
+# MangoPi MQ Pro - C906FD, C906FDV
+# Lichee Pi 4A - C910, C910V (?)
+# CanMV K230 - C908, C908V
+
+# See https://github.com/T-head-Semi/gcc/blob/xuantie-gcc-10.4.0/gcc/config/riscv/riscv-cores.def
+
+set(_enable_vector OFF)
+if(CORE STREQUAL "C906FD")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd")
+elseif(CORE STREQUAL "C906FDV")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd")
+  # Disabled due to limited 64-bit SEW support
+  # set(_enable_vector ON)
+elseif(CORE STREQUAL "C908")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c908 -mabi=lp64d -mtune=c908")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908 -mabi=lp64d -mtune=c908")
+elseif(CORE STREQUAL "C908V")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c908v -mabi=lp64d -mtune=c908")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908v -mabi=lp64d -mtune=c908")
+  set(_enable_vector ON) # RVV 1.0
+elseif(CORE STREQUAL "C910")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910")
+elseif(CORE STREQUAL "C910V")
+  set(CMAKE_C_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d")
+  set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d")
+  set(_enable_vector ON) # RVV 0.7.1
+elseif(CORE STREQUAL "C920")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c920 -mabi=lp64d -mtune=c920")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920 -mabi=lp64d -mtune=c920")
+  set(_enable_vector ON) # RVV 0.7.1
+elseif(CORE STREQUAL "C920V2")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d -mtune=c920v2")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d -mtune=c920v2")
+  set(_enable_vector ON) # RVV 1.0
+else()
+  set(CMAKE_C_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d")
+  set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d")
+endif()
+
+if(_enable_vector)
+  set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128")
+  set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128")
+endif()
+
+if(ENABLE_GCOV)
+  set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fprofile-arcs -ftest-coverage")
+  set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -fprofile-arcs -ftest-coverage")
+endif()
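For reference, a few standalone sketches of the reworked intrinsic patterns follow. They are illustrations only, written against the T-Head RVV 0.7.1 toolchain this patch targets, with 128-bit vectors and the non-prefixed intrinsic names used in the header above; any load/store helper that does not appear in the diff itself (vse8_v_u8m1, vle32_v_f32m1 and similar) is assumed to follow the same naming scheme. The new v_reverse builds its index vector at run time instead of loading a hard-coded mask: vid_v_* produces {0, 1, ..., n-1} and vrsub_vx_* turns it into {n-1, ..., 0} for vrgather:

    #include <riscv_vector.h>

    // Reverse 16 bytes: build idx = {15,14,...,0} with vid + vrsub, then gather.
    void reverse16_u8(const unsigned char* src, unsigned char* dst)
    {
        const int vl = 16;
        vuint8m1_t v   = vle8_v_u8m1(src, vl);
        vuint8m1_t idx = vrsub_vx_u8m1(vid_v_u8m1(vl), 15, vl);   // 15 - {0..15}
        vse8_v_u8m1(dst, vrgather_vv_u8m1(v, idx, vl), vl);       // assumed store name
    }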
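v_extract_n is now implemented with a slide-down plus a scalar move instead of indexing into the vector value. The header does this with a template parameter; a runtime index works the same way in this reduced sketch (vundefined_i32m1 is the same kind of helper as the other vundefined_* calls used in the diff):

    #include <riscv_vector.h>

    // Read lane i of a 4-lane i32 vector by sliding it down to element 0.
    int extract_lane_i32(vint32m1_t v, int i)
    {
        const int vl = 4;
        vint32m1_t tmp = vundefined_i32m1();          // destination operand only
        tmp = vslidedown_vx_i32m1(tmp, v, i, vl);     // lane i moves to position 0
        return vmv_x_s_i32m1_i32(tmp);                // scalar move from element 0
    }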
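The rounding helpers program the scalar frm CSR, mask out NaN/Inf lanes, convert with vfcvt and restore the default mode; barrier1 is an empty asm with a memory clobber, apparently there to keep the compiler from reordering the vector code around the CSR write. The round-to-nearest case, as a standalone sketch under those assumptions:

    #include <riscv_vector.h>

    static inline void set_frm(int mode)   { asm volatile("csrw frm, %0" : : "r"(mode)); }
    static inline void keep_order(void* p) { __asm__ __volatile__("" : : "r"(p) : "memory"); }

    // Round 4 floats to nearest-even; NaN/Inf lanes are masked off and produce 0.
    void round_nearest4(const float* src, int* dst)
    {
        const int vl = 4;
        set_frm(0);                                               // 0 = round to nearest, ties to even
        vfloat32m1_t a  = vle32_v_f32m1(src, vl);                 // assumed load name
        vint32m1_t bits = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a), 0x7f800000, vl);
        keep_order(&bits);
        vbool32_t finite = vmsne_vx_i32m1_b32(bits, 0x7f800000, vl);   // false for NaN/Inf
        vint32m1_t r = vfcvt_x_f_v_i32m1_m(finite, vmv_v_x_i32m1(0, vl), a, vl);
        set_frm(0);
        vse32_v_i32m1(dst, r, vl);                                // assumed store name
    }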
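The interleaved load/store macros no longer go through vuint8m1x2_t-style tuples: the segment intrinsics now carry the element width in their name and read into, or write from, separate vector registers. For two interleaved u8 channels the macro expansion boils down to something like this (the plain store names are assumed to mirror the segment forms shown in the diff):

    #include <riscv_vector.h>

    // Split a 2-channel interleaved buffer into planes and merge it back.
    void split_merge_2ch_u8(const unsigned char* interleaved,
                            unsigned char* plane0, unsigned char* plane1,
                            unsigned char* out)
    {
        const int vl = 16;
        vuint8m1_t a, b;
        vlseg2e8_v_u8m1(&a, &b, interleaved, vl);   // a = channel 0, b = channel 1
        vse8_v_u8m1(plane0, a, vl);
        vse8_v_u8m1(plane1, b, vl);
        vsseg2e8_v_u8m1(out, a, b, vl);             // re-interleave
    }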
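The compound-literal vector initializers ((vuint64m1_t){...}) that the old toolchain accepted are gone; shuffle masks are now loaded from a small scalar array and reinterpreted as a byte index vector, as in v_interleave_pairs. The same pattern in isolation:

    #include <riscv_vector.h>
    #include <stdint.h>

    // Byte-pair swap mask used by v_interleave_pairs(v_int8x16).
    vuint8m1_t pair_swap_mask(void)
    {
        uint64_t mdata[2] = { 0x0705060403010200, 0x0F0D0E0C0B090A08 };
        vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
        return vreinterpret_v_u64m1_u8m1(m0);
    }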
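FP16 support is now split on __riscv_v: the 0.7.1 toolchain widens through an m2 register and keeps the low m1 half, while newer toolchains use a fractional f16mf2 register directly. The 0.7.1 load-expand path reduced to a sketch, using __fp16 as the header does and an assumed vse32_v_f32m1 store:

    #include <riscv_vector.h>

    // Widen 4 half-precision values to float, RVV 0.7.1 style (m2 temporary, low half kept).
    void load_expand_f16(const __fp16* src, float* dst)
    {
        const int vl = 4;
        vfloat16m1_t h = vle16_v_f16m1(src, vl);
        vfloat32m2_t w = vfwcvt_f_f_v_f32m2(h, vl);
        vse32_v_f32m1(dst, vget_v_f32m2_f32m1(w, 0), vl);
    }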
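On the build side, the rewritten riscv64-071-gcc.toolchain.cmake no longer hard-codes -march=rv64gcv: the flags are chosen from a CORE cache variable (for example, configuring with -DCORE=C910V together with -DCMAKE_TOOLCHAIN_FILE=platforms/linux/riscv64-071-gcc.toolchain.cmake selects the RVV 0.7.1 configuration), cores whose branch sets _enable_vector additionally get -D__riscv_vector_071 and -mrvv-vector-bits=128 appended, and an unknown or unset CORE falls back to the generic xtheadc march string without vector support.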