Support fp16 for RISC-V.

pull/25743/head
Liutong HAN 6 months ago
parent f24e80297a
commit f3cc5a9e1e
  1. 70
      cmake/OpenCVCompilerOptimizations.cmake
  2. 21
      cmake/checks/cpu_fp16.cpp
  3. 25
      cmake/checks/cpu_rvv_fp16.cpp
  4. 7
      modules/core/include/opencv2/core/cv_cpu_dispatch.h
  5. 21
      modules/core/include/opencv2/core/cv_cpu_helper.h
  6. 8
      modules/core/include/opencv2/core/cvdef.h
  7. 24
      modules/core/include/opencv2/core/hal/intrin.hpp
  8. 499
      modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
  9. 7
      modules/core/src/system.cpp
  10. 24
      modules/core/test/test_intrin_utils.hpp

@ -52,7 +52,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SK
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV FP16 RVV_ZVFH)
list(APPEND CPU_ALL_OPTIMIZATIONS LSX)
list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
@ -170,6 +170,21 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ")
set(CPU_BASELINE_DETECT ON)
endif()
# This macro traverses all the dependent (IMPLIES) backends for the CPU_${OPT}_FLAGS_ON.
macro(ocv_cpu_riscv_update_flag FEATURE_NAME_LIST COMMON_OPTION)
foreach(OPT IN LISTS ${FEATURE_NAME_LIST})
unset(APPEND_TRAILING)
# traverse all dependency and merge extensions to a flag.
foreach(IMPLIE IN LISTS CPU_${OPT}_IMPLIES)
string(APPEND APPEND_TRAILING "_${CPU_${IMPLIE}_FLAG}")
endforeach()
string(APPEND APPEND_TRAILING "_${CPU_${OPT}_FLAG}")
# Update flag
set(CPU_${OPT}_FLAGS_ON "${COMMON_OPTION}${APPEND_TRAILING}")
message(STATUS "CPU_${OPT}_FLAGS_ON is ${CPU_${OPT}_FLAGS_ON}")
endforeach()
endmacro()
if(X86 OR X86_64)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
@ -390,12 +405,28 @@ elseif(PPC64LE)
set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(RISCV)
if(NOT DEFINED PLATFORM_STR)
set(PLATFORM_STR "rv64gc")
endif()
ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV;FP16;RVV_ZVFH")
ocv_update(CPU_RVV_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV")
ocv_update(CPU_RVV_FLAGS_ON "-march=rv64gcv")
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
ocv_update(CPU_RVV_ZVFH_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv_fp16.cpp")
ocv_update(CPU_RVV_ZVFH_IMPLIES "RVV;FP16")
ocv_update(CPU_FP16_IMPLIES "RVV")
set(CPU_RVV_FLAG "v")
set(CPU_FP16_FLAG "zvfhmin")
set(CPU_RVV_ZVFH_FLAG "zvfh")
set(BASE_ARCHITECTURE "-march=${PLATFORM_STR}")
ocv_cpu_riscv_update_flag(CPU_KNOWN_OPTIMIZATIONS ${BASE_ARCHITECTURE})
ocv_update(CPU_RVV_FLAGS_CONFLICT "-march=[^ ]*")
set(CPU_DISPATCH "" CACHE STRING "${HELP_CPU_DISPATCH}")
if(NOT ${BUILD_SHARED_LIBS}) # static build for k230
add_extra_compiler_option("-static -static-libgcc -static-libstdc++")
endif()
set(CPU_DISPATCH "FP16;RVV_ZVFH" CACHE STRING "${HELP_CPU_DISPATCH}")
set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(LOONGARCH64)
@ -495,6 +526,32 @@ macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRI
endif()
endmacro()
macro(ocv_cpu_riscv_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION)
unset(_POSTFIX)
unset(APPEND_TRAILING)
# Check each feature option.
foreach(OPT IN LISTS ${FEATURE_NAME_LIST})
string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND)
if(NOT ${OPT_FOUND} EQUAL -1)
# e.g. when ${CPU_${OPT}_FLAGS_ON} is "rv64gc_v_zvfhmin"
# the ${TRAILING_PART} will be "_v_zvfhmin"
# and the ${parts} will be "_v;_zvfhmin" (a list)
string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}")
string(REGEX MATCHALL "_[^_]+" parts ${TRAILING_PART})
list(APPEND _POSTFIX ${parts})
# remove ${CPU_${OPT}_FLAGS_ON} from ${FLAG_STRING}
string(REGEX REPLACE "${CPU_${OPT}_FLAGS_ON}( |$)" "" ${FLAG_STRING} ${${FLAG_STRING}})
endif()
endforeach()
# Remove the duplicate extensions. (e.g. _v, _v, ...)
list(REMOVE_DUPLICATES _POSTFIX)
# Merge to one extensions flag
foreach(TRAILING IN LISTS _POSTFIX)
string(APPEND APPEND_TRAILING "${TRAILING}")
endforeach()
set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${APPEND_TRAILING}")
endmacro()
foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "")
if("${CPU_${OPT}_FLAGS_ON}" STREQUAL "disabled")
@ -597,6 +654,11 @@ if(AARCH64)
endif()
endif()
if(RISCV)
string(STRIP "${CPU_BASELINE_FLAGS}" CPU_BASELINE_FLAGS)
ocv_cpu_riscv_baseline_merge_feature_options(CPU_KNOWN_OPTIMIZATIONS CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE})
endif()
foreach(OPT ${CPU_BASELINE_REQUIRE})
if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})")

@ -23,6 +23,27 @@ int test()
*(float16x4_t*)dst = v_dst;
return (int)dst[0];
}
#elif (defined __riscv_zvfhmin && __riscv_zvfhmin) || (defined __riscv_zvfh && __riscv_zvfh)
#include <riscv_vector.h>
int test()
{
const _Float16 input1[] = {0.5f, 1.5f, 2.5f, 3.5f};
const float input2[] = {-0.5f, -1.5f, -2.5f, -3.5f};
short dst[4];
size_t vl = __riscv_vsetvl_e16m1(4);
vfloat16m1_t in_f16 = __riscv_vle16_v_f16m1(input1, vl);
vfloat32m2_t in_f32 = __riscv_vle32_v_f32m2(input2, vl);
vfloat32m2_t cvt_f32 = __riscv_vfwcvt_f_f_v_f32m2(in_f16, vl);
vfloat32m2_t res_f32 = __riscv_vfadd(in_f32, cvt_f32, vl);
vfloat16m1_t res_f16 = __riscv_vfncvt_f_f_w_f16m1(res_f32, vl);
__riscv_vse16_v_f16m1((_Float16*)dst, res_f16, vl);
return (int)dst[0];
}
#else
#error "FP16 is not supported"
#endif

@ -0,0 +1,25 @@
#include <stdio.h>
#if defined(__riscv) && __riscv && defined (__riscv_zvfh) && __riscv_zvfh
# include <riscv_vector.h>
int test()
{
const _Float16 input1[] = {0.5f, 1.5f, 2.5f, 3.5f};
const _Float16 input2[] = {-0.5f, -1.5f, -2.5f, -3.5f};
size_t vl = __riscv_vsetvl_e16m1(4);
vfloat16m1_t vec1 = __riscv_vle16_v_f16m1(input1, vl);
vfloat16m1_t vec2 = __riscv_vle16_v_f16m1(input2, vl);
vfloat16m1_t result = __riscv_vfadd_vv_f16m1(vec1, vec2, vl);
return (int)__riscv_vfmv_f_s_f16m1_f16(result);
}
#else
#error "RISC-V Vector Extension with Half-Precision Floating-Point (zvfh) is not supported"
#endif
int main()
{
printf("%d\n", test());
return 0;
}

@ -74,6 +74,8 @@
#ifdef CV_CPU_COMPILE_FP16
# if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
# include <arm_neon.h>
# elif defined(__riscv_vector)
# include <riscv_vector.h>
# else
# include <immintrin.h>
# endif
@ -250,6 +252,11 @@ struct VZeroUpperGuard {
# define CV_FP16 1
#endif
#if defined(__riscv_zvfhmin) && __riscv_zvfhmin || (defined(__riscv_zvfh) && __riscv_zvfh)
# include <riscv_vector.h>
# define CV_FP16 1
#endif
#endif // !__OPENCV_BUILD && !__CUDACC (Compatibility code)

@ -567,6 +567,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_RVV_ZVFH
# define CV_TRY_RVV_ZVFH 1
# define CV_CPU_FORCE_RVV_ZVFH 1
# define CV_CPU_HAS_SUPPORT_RVV_ZVFH 1
# define CV_CPU_CALL_RVV_ZVFH(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_RVV_ZVFH_(fn, args) return (opt_RVV_ZVFH::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_RVV_ZVFH
# define CV_TRY_RVV_ZVFH 1
# define CV_CPU_FORCE_RVV_ZVFH 0
# define CV_CPU_HAS_SUPPORT_RVV_ZVFH (cv::checkHardwareSupport(CV_CPU_RVV_ZVFH))
# define CV_CPU_CALL_RVV_ZVFH(fn, args) if (CV_CPU_HAS_SUPPORT_RVV_ZVFH) return (opt_RVV_ZVFH::fn args)
# define CV_CPU_CALL_RVV_ZVFH_(fn, args) if (CV_CPU_HAS_SUPPORT_RVV_ZVFH) return (opt_RVV_ZVFH::fn args)
#else
# define CV_TRY_RVV_ZVFH 0
# define CV_CPU_FORCE_RVV_ZVFH 0
# define CV_CPU_HAS_SUPPORT_RVV_ZVFH 0
# define CV_CPU_CALL_RVV_ZVFH(fn, args)
# define CV_CPU_CALL_RVV_ZVFH_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV_ZVFH(fn, args, mode, ...) CV_CPU_CALL_RVV_ZVFH(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX
# define CV_TRY_LSX 1
# define CV_CPU_FORCE_LSX 1

@ -288,6 +288,7 @@ namespace cv {
#define CV_CPU_VSX3 201
#define CV_CPU_RVV 210
#define CV_CPU_RVV_ZVFH 211
#define CV_CPU_LSX 230
#define CV_CPU_LASX 231
@ -350,6 +351,7 @@ enum CpuFeatures {
CPU_VSX3 = 201,
CPU_RVV = 210,
CPU_RVV_ZVFH = 211,
CPU_LSX = 230,
CPU_LASX = 231,
@ -384,6 +386,8 @@ enum CpuFeatures {
#if defined __ARM_FP16_FORMAT_IEEE \
&& !defined __CUDACC__
# define CV_FP16_TYPE 1
#elif (defined(__riscv_zvfh) && __riscv_zvfh) || (defined(__riscv_zvfhmin) && __riscv_zvfhmin)
# define CV_FP16_TYPE 1
#else
# define CV_FP16_TYPE 0
#endif
@ -838,12 +842,14 @@ class hfloat
public:
#if CV_FP16_TYPE
hfloat() = default;
explicit hfloat(float x) { h = (__fp16)x; }
operator float() const { return (float)h; }
#if defined __ARM_FP16_FORMAT_IEEE
explicit hfloat(float x) { h = (__fp16)x; }
protected:
__fp16 h;
#else
explicit hfloat(float x) { h = (_Float16)x; }
explicit operator _Float16() const { return h; }
protected:
_Float16 h;
#endif

@ -343,6 +343,10 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD_SCALABLE_64F 0
#endif
#ifndef CV_SIMD_SCALABLE_FP16
#define CV_SIMD_SCALABLE_FP16 0
#endif
//==================================================================================================
template<typename _Tp> struct V_RegTraits
@ -412,6 +416,9 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
#if CV_SIMD_SCALABLE_FP16
CV_DEF_REG_TRAITS(v, v_float16, hfloat, f16, v_float16, v_float32, v_float64, v_int16, v_int16);
#endif
CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
@ -542,6 +549,7 @@ using namespace CV__SIMD_NAMESPACE;
#define CV__SIMD_NAMESPACE simd
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 0
#define CV_SIMD_FP16 0
#define CV_SIMD_WIDTH 128 /* 1024/8 */
#define VXPREFIX(func) v##func
@ -565,7 +573,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_setall_f16(hfloat v) { return VXPREFIX(_setall_f16)(v); }
#endif
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
@ -585,7 +593,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_setzero_f16() { return VXPREFIX(_setzero_f16)(); }
#endif
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
@ -605,7 +613,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load(const hfloat * ptr) { return VXPREFIX(_load)(ptr); }
#endif
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
@ -625,7 +633,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load_aligned(const hfloat * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
@ -645,7 +653,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load_low(const hfloat * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
@ -665,7 +673,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load_halves(const hfloat * ptr0, const hfloat * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
@ -685,7 +693,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_lut(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
@ -705,7 +713,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_lut_pairs(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }

@ -30,6 +30,12 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD_SCALABLE 1
#define CV_SIMD_SCALABLE_64F 1
#if defined(__riscv_zvfh) && __riscv_zvfh
#define CV_SIMD_SCALABLE_FP16 1
#else
#define CV_SIMD_SCALABLE_FP16 0
#endif
using v_uint8 = vuint8m1_t;
using v_int8 = vint8m1_t;
@ -40,6 +46,9 @@ using v_int32 = vint32m1_t;
using v_uint64 = vuint64m1_t;
using v_int64 = vint64m1_t;
#if CV_SIMD_SCALABLE_FP16
using v_float16 = vfloat16m1_t;
#endif
using v_float32 = vfloat32m1_t;
#if CV_SIMD_SCALABLE_64F
using v_float64 = vfloat64m1_t;
@ -117,6 +126,13 @@ OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m2_t, uint64_t, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m4_t, uint64_t, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m8_t, uint64_t, e64m8, 64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m1_t, hfloat, e16m1, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m2_t, hfloat, e16m2, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m4_t, hfloat, e16m4, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m8_t, hfloat, e16m8, 16)
#endif
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m1_t, float, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m2_t, float, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m4_t, float, e32m4, 32)
@ -155,6 +171,12 @@ OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
#if CV_SIMD_SCALABLE_FP16
inline hfloat v_get0(const v_float16& v) \
{ \
return (hfloat)__riscv_vfmv_f(v); \
}
#endif
inline float v_get0(const v_float32& v) \
{ \
return __riscv_vfmv_f(v); \
@ -197,6 +219,20 @@ inline v_##_Tpv v_setall_##suffix(_Tp v) \
return __riscv_vfmv_v_f_##suffix##m1(v, vl); \
}
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_setzero_f16()
{
return __riscv_vfmv_v_f_f16m1(0, VTraits<v_float16>::vlanes());
}
inline v_float16 v_setall_f16(float v) // In some cases we may use v_setall_f16(1.0f)
{
return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits<v_float16>::vlanes());
}
inline v_float16 v_setall_f16(hfloat v)
{
return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits<v_float16>::vlanes());
}
#endif
OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
@ -216,6 +252,9 @@ OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
@ -234,6 +273,10 @@ inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, float16, u16, f16, u16, f16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, float16, s16, f16, i16, f16)
#endif
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
@ -277,6 +320,14 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float16, u8, f16, u, f, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float16, u32, f16, u, f, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float16, u64, f16, u, f, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float16, s8, f16, i, f, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float16, s32, f16, i, f, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float16, s64, f16, i, f, 64, 16)
#endif
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
@ -291,6 +342,17 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
// Three times reinterpret
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_reinterpret_as_f16(const v_float64& v) \
{ \
return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\
}
inline v_float64 v_reinterpret_as_f64(const v_float16& v) \
{ \
return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_f16m1_u16m1(v)));\
}
#endif
inline v_float32 v_reinterpret_as_f32(const v_float64& v) \
{ \
return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\
@ -332,9 +394,12 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
} \
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
{ \
return __riscv_vfmv_f(__riscv_vslidedown(v, i, vl)); \
return (_Tp)__riscv_vfmv_f(__riscv_vslidedown(v, i, vl)); \
}
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float16, hfloat, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, VTraits<v_float64>::vlanes())
@ -343,7 +408,7 @@ OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, VTraits<v_float64>::vlanes())
#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \
inline _Tp v_extract_highest(_Tpvec v) \
{ \
return v_extract_n(v, vl-1); \
return (_Tp)v_extract_n(v, vl-1); \
}
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::vlanes())
@ -354,6 +419,9 @@ OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits<v_int64>::vlanes())
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float16, hfloat, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits<v_float64>::vlanes())
@ -408,6 +476,47 @@ _Tpvec v_load_##suffix(Targs... nScalars) \
return v_load({nScalars...}); \
}
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
__riscv_vse##width##_v_##suffix##m1((_Float16*)ptr, a, vl); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, hvl); \
} \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m1((_Float16*)ptr0, hvl), __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr1, hvl), hvl, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, __riscv_vslidedown_vx_##suffix##m1(a, hvl, vl), hvl); \
}
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8)
@ -417,6 +526,9 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(v_float16, vfloat16m1_t, hfloat, VTraits<v_float16>::vlanes() /2 , VTraits<v_float16>::vlanes(), 16, f16)
#endif
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() /2 , VTraits<v_float32>::vlanes(), 32, f32)
#if CV_SIMD_SCALABLE_64F
@ -430,16 +542,25 @@ inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
#define OPENCV_HAL_IMPL_RVV_LUT_FP16(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
{ \
auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32((_Float16*)tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LUT_FP16(v_float16, hfloat, m2)
#endif
OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
#endif
#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \
#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, _TpCast, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
auto v0 = __riscv_vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \
@ -449,19 +570,22 @@ inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
auto sh1 = __riscv_vslide1up(v_trunc(__riscv_vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); \
auto vid = __riscv_vor(sh1, v_trunc(__riscv_vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \
auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32((_TpCast *)tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, schar, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, short, m1, m2, OPENCV_HAL_NOP)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float16, hfloat, _Float16, m1, m2, OPENCV_HAL_NOP)
#endif
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, int, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, float, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, int64_t, mf2, m1, __riscv_vlmul_trunc_u32mf2)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, double, mf2, m1, __riscv_vlmul_trunc_u32mf2)
#endif
#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \
#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, _TpCast, suffix0, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
auto v0 = __riscv_vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \
@ -481,12 +605,15 @@ inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
auto shwid1 = __riscv_vslide1up(__riscv_vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \
auto vid = __riscv_vor(shwid1, __riscv_vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \
auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32((_TpCast *)tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, schar, m1, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, short, mf2 , m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, int, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float16, hfloat, _Float16, mf2 , m1, m2, OPENCV_HAL_NOP)
#endif
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, float, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
@ -557,6 +684,12 @@ OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, __riscv_vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, __riscv_vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, __riscv_vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, __riscv_vssub)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, add, __riscv_vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, sub, __riscv_vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, mul, __riscv_vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, div, __riscv_vfdiv)
#endif
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, __riscv_vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, __riscv_vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, __riscv_vmul)
@ -602,6 +735,10 @@ OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, __riscv_vadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, __riscv_vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, __riscv_vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, __riscv_vfmul)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float16, __riscv_vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float16, __riscv_vfmul)
#endif
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, __riscv_vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, __riscv_vfmul)
@ -689,14 +826,30 @@ OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits<v_int64>::vlanes())
#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) \
#if CV_SIMD_SCALABLE_FP16
#define OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(intrin) \
inline v_float16 intrin (const v_float16& a, const v_float16& b) \
{ \
return __riscv_vreinterpret_f16m1(intrin(__riscv_vreinterpret_i16m1(a), __riscv_vreinterpret_i16m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_xor)
inline v_float16 v_not (const v_float16& a) \
{ \
return __riscv_vreinterpret_f16m1(v_not(__riscv_vreinterpret_i16m1(a))); \
}
#endif
#define OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(intrin) \
inline v_float32 intrin (const v_float32& a, const v_float32& b) \
{ \
return __riscv_vreinterpret_f32m1(intrin(__riscv_vreinterpret_i32m1(a), __riscv_vreinterpret_i32m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor)
OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_xor)
inline v_float32 v_not (const v_float32& a) \
{ \
@ -774,6 +927,18 @@ inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \
return _Tpvec(res); \
} //TODO
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, op, intrin, suffix) \
inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \
{ \
size_t VLEN = VTraits<_Tpvec>::vlanes(); \
union { uint64_t u; _Float16 d; } ones; \
ones.u = -1; \
auto diff = intrin(a, b, VLEN); \
auto z = __riscv_vfmv_v_f_##suffix##m1(0, VLEN); \
auto res = __riscv_vfmerge(z, ones.d, diff, VLEN); \
return _Tpvec(res); \
} //TODO
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, __riscv_vmseq, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, __riscv_vmsne, suffix) \
@ -798,6 +963,13 @@ OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, __riscv_vmfgt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, __riscv_vmfle, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, __riscv_vmfge, suffix)
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_FP16(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, eq, __riscv_vmfeq, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, ne, __riscv_vmfne, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, lt, __riscv_vmflt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, gt, __riscv_vmfgt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, le, __riscv_vmfle, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, ge, __riscv_vmfge, suffix)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16)
@ -807,11 +979,19 @@ OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_FP16(v_float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64)
#endif
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_not_nan(const v_float16& a)
{ return v_eq(a, a); }
#endif
inline v_float32 v_not_nan(const v_float32& a)
{ return v_eq(a, a); }
@ -840,6 +1020,10 @@ OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, __riscv_vminu, VTraits<v_uint32>::
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, __riscv_vmaxu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, __riscv_vmin, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, __riscv_vmax, VTraits<v_int32>::vlanes())
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float16, v_min, __riscv_vfmin, VTraits<v_float16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float16, v_max, __riscv_vfmax, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, __riscv_vfmin, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, __riscv_vfmax, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
@ -990,6 +1174,10 @@ OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits<v_int16>::vlanes(),
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits<v_uint32>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits<v_int32>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits<v_float32>::vlanes(), fredmax)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, max, hfloat, f16, VTraits<v_float16>::vlanes(), fredmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, min, hfloat, f16, VTraits<v_float16>::vlanes(), fredmin)
#endif
inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
const v_float32& c, const v_float32& d)
@ -1043,53 +1231,31 @@ inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
}
////////////// Square-Root //////////////
inline v_float32 v_sqrt(const v_float32& x)
{
return __riscv_vfsqrt(x, VTraits<v_float32>::vlanes());
}
inline v_float32 v_invsqrt(const v_float32& x)
{
v_float32 one = v_setall_f32(1.0f);
return v_div(one, v_sqrt(x));
}
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_sqrt(const v_float64& x)
{
return __riscv_vfsqrt(x, VTraits<v_float64>::vlanes());
#define OPENCV_HAL_IMPL_RVV_SQR_FP(_Tpvec, _setAllFunc) \
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ \
return __riscv_vfsqrt(x, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_invsqrt(const _Tpvec& x) \
{ \
return v_div(_setAllFunc(1.0f), v_sqrt(x)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpvec x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<_Tpvec>::vlanes()), b, b, VTraits<_Tpvec>::vlanes()); \
return v_sqrt(x); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
return __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<_Tpvec>::vlanes()), b, b, VTraits<_Tpvec>::vlanes()); \
}
inline v_float64 v_invsqrt(const v_float64& x)
{
v_float64 one = v_setall_f64(1.0f);
return v_div(one, v_sqrt(x));
}
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_SQR_FP(v_float16, v_setall_f16)
#endif
inline v_float32 v_magnitude(const v_float32& a, const v_float32& b)
{
v_float32 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes());
return v_sqrt(x);
}
inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b)
{
return v_float32(__riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes()));
}
OPENCV_HAL_IMPL_RVV_SQR_FP(v_float32, v_setall_f32)
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_magnitude(const v_float64& a, const v_float64& b)
{
v_float64 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
return v_sqrt(x);
}
inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b)
{
return __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
}
OPENCV_HAL_IMPL_RVV_SQR_FP(v_float64, v_setall_f64)
#endif
////////////// Multiply-Add //////////////
@ -1113,6 +1279,18 @@ inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c)
return v_fma(a, b, c);
}
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_fma(const v_float16& a, const v_float16& b, const v_float16& c)
{
return __riscv_vfmacc(c, a, b, VTraits<v_float16>::vlanes());
}
inline v_float16 v_muladd(const v_float16& a, const v_float16& b, const v_float16& c)
{
return v_fma(a, b, c);
}
#endif
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c)
{
@ -1153,6 +1331,13 @@ inline bool v_check_all(const v_uint16& a)
inline bool v_check_any(const v_uint16& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
#if CV_SIMD_SCALABLE_FP16
inline bool v_check_all(const v_float16& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_float16& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
#endif
inline bool v_check_all(const v_uint32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint32& a)
@ -1186,6 +1371,9 @@ inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float16, absdiff)
#endif
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff)
@ -1212,6 +1400,9 @@ inline _Tprvec v_abs(const _Tpvec& a) \
OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8)
OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16)
OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ABS(v_float16, v_float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64)
@ -1246,6 +1437,12 @@ OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits<v_int32>::vlanes())
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_select(const v_float16& mask, const v_float16& a, const v_float16& b) \
{ \
return __riscv_vmerge(b, a, __riscv_vmfne(mask, 0, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes()); \
}
#endif
inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b) \
{ \
@ -1314,12 +1511,39 @@ template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float16, f16, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits<v_float64>::vlanes())
#endif
////////////// Convert to float //////////////
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_cvt_f16(const v_float32 &a)
{
return __riscv_vfncvt_f(__riscv_vlmul_ext_f32m2(a), VTraits<v_float32>::vlanes());
}
inline v_float16 v_cvt_f16(const v_float32 &a, const v_float32 &b)
{
return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f32m2(a),1,b), VTraits<v_float16>::vlanes());
}
inline v_float16 v_cvt_f16(const v_int16 &a)
{
return __riscv_vfcvt_f(a, VTraits<v_float16>::vlanes());
}
inline v_float32 v_cvt_f32(const v_float16 &a)
{
return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits<v_float16>::vlanes()), 0);
}
inline v_float32 v_cvt_f32_high(const v_float16 &a)
{
return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits<v_float16>::vlanes()), 1);
}
#endif
inline v_float32 v_cvt_f32(const v_int32& a)
{
return __riscv_vfcvt_f_x_v_f32m1(a, VTraits<v_float32>::vlanes());
@ -1367,13 +1591,16 @@ inline v_float64 v_cvt_f64(const v_int64& a)
#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
template<int s = 0> inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \
{ \
return v_setall_##suffix(v_extract_n(v, i)); \
return v_setall_##suffix((_Float16)v_extract_n(v, i)); \
} \
inline _Tpvec v_broadcast_highest(_Tpvec v) \
{ \
return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \
return v_setall_##suffix((_Float16)v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \
}
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BROADCAST(v_float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)
@ -1390,6 +1617,9 @@ OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_REVERSE(v_float16, 16)
#endif
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
@ -1531,6 +1761,9 @@ OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_H
OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, __riscv_vreinterpret_u8m2, __riscv_vreinterpret_u8m1)
OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, __riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ZIP(v_float16, vfloat16m2_t, f16, 16, 32, __riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1)
#endif
OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1)
OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1)
@ -1580,66 +1813,72 @@ OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float16, 16)
#endif
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
#endif
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, _TpCast, suffix, width, hwidth, vl) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
}\
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
\
a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
d = __riscv_vlse##width##_v_##suffix##m1(ptr+3, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
d = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+3), sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
__riscv_vsse##width(ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
__riscv_vsse##width(ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+2), sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
__riscv_vsse##width(ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
__riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+2), sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+3), sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, short, i16, 16, 8, VTraits<v_int16>::vlanes())
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float16, hfloat, _Float16, f16, 16, 8, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, int, i32, 32, 16, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, float, f32, 32, 16, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits<v_float64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, double, f64, 64, 32, VTraits<v_float64>::vlanes())
#endif
static uint64_t idx_interleave_pairs[] = { \
@ -1781,6 +2020,10 @@ inline int64 v_signmask(const v_uint8& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }
inline int64 v_signmask(const v_uint16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
#if CV_SIMD_SCALABLE_FP16
inline int v_signmask(const v_float16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
#endif
inline int v_signmask(const v_uint32& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32& a)
@ -1862,6 +2105,35 @@ inline void v_pack_store(hfloat* ptr, const v_float32& v)
}
#endif
////////////// Rounding //////////////
#if CV_SIMD_SCALABLE_FP16
inline v_int16 v_round(const v_float16& a)
{
return __riscv_vfcvt_x(a, VTraits<v_float16>::vlanes());
}
inline v_int16 v_floor(const v_float16& a)
{
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
return __riscv_vfcvt_x_f_v_i16m1_rm(a, 1 /*RNE, round-to-nearest-even*/, VTraits<v_float16>::vlanes());
#else
return __riscv_vfcvt_x(vfsub(a, 0.5f - 1e-5, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes());
#endif
}
inline v_int16 v_ceil(const v_float16& a)
{
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
return __riscv_vfcvt_x_f_v_i16m1_rm(a, 3 /*ROD, round-to-odd*/, VTraits<v_float16>::vlanes());
#else
return __riscv_vfcvt_x(vfadd(a, 0.5f - 1e-5, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes());
#endif
}
inline v_int16 v_trunc(const v_float16& a)
{
return __riscv_vfcvt_rtz_x(a, VTraits<v_float16>::vlanes());
}
#endif
inline v_int32 v_round(const v_float32& a)
{
// return vfcvt_x(vfadd(a, 1e-6, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
@ -2155,6 +2427,41 @@ inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const
#endif
// TODO: only 128 bit now.
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_matmul( const v_float16 &v,
const v_float16 &m0, const v_float16 &m1,
const v_float16 &m2, const v_float16 &m3,
const v_float16 &m4, const v_float16 &m5,
const v_float16 &m6, const v_float16 &m7) {
vfloat16m1_t res;
res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 7), m7, VTraits<v_float16>::vlanes());
return res;
}
inline v_float16 v_matmuladd( const v_float16 &v,
const v_float16 &m0, const v_float16 &m1,
const v_float16 &m2, const v_float16 &m3,
const v_float16 &m4, const v_float16 &m5,
const v_float16 &m6,
const v_float16 &a) {
vfloat16m1_t res;
res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits<v_float16>::vlanes());
return __riscv_vfadd(res, a, VTraits<v_float16>::vlanes());
}
#endif
inline v_float32 v_matmul(const v_float32& v, const v_float32& m0,
const v_float32& m1, const v_float32& m2,
const v_float32& m3)

@ -434,6 +434,7 @@ struct HWFeatures
g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL";
g_hwFeatureNames[CPU_RVV] = "RVV";
g_hwFeatureNames[CPU_RVV_ZVFH] = "RVV_ZVFH";
g_hwFeatureNames[CPU_LSX] = "LSX";
g_hwFeatureNames[CPU_LASX] = "LASX";
@ -712,6 +713,12 @@ struct HWFeatures
#if defined __riscv && defined __riscv_vector
have[CV_CPU_RVV] = true;
#if (defined __riscv_zvfh && __riscv_zvfh) || (defined __riscv_zvfhmin && __riscv_zvfhmin)
have[CV_CPU_FP16] = true;
#endif
#if defined __riscv_zvfh && __riscv_zvfh
have[CV_CPU_RVV_ZVFH] = true;
#endif
#endif
#if defined __loongarch64 && defined __linux__

@ -159,8 +159,8 @@ template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R
out << "{ ";
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
// out << std::hex << +V_TypeTraits<typename VTraits<R>::lane_type>::reinterpret_int(d.d[i]);
out << +d.d[i];
out << std::hex << +V_TypeTraits<typename VTraits<R>::lane_type>::reinterpret_int(d.d[i]);
// out << +d.d[i]; // Note: No operator '<<' for _Float16
if (i + 1 < VTraits<R>::vlanes())
out << ", ";
}
@ -182,7 +182,7 @@ template<> inline void EXPECT_COMPARE_EQ_<double>(const double a, const double b
EXPECT_DOUBLE_EQ( a, b );
}
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
template<> inline void EXPECT_COMPARE_EQ_<hfloat>(const hfloat a, const hfloat b)
{
EXPECT_LT(std::abs(float(a - b)), 0.126);
@ -564,7 +564,7 @@ template<typename R> struct TheTest
// Handle accuracy for fp16
TheTest & test_div_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
@ -1572,7 +1572,7 @@ template<typename R> struct TheTest
TheTest & test_matmul_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
Data<R> dataV, data0, data1, data2, data3, data4, data5, data6, data7;
data1.reverse();
data2 += 2;
@ -1657,7 +1657,8 @@ template<typename R> struct TheTest
TheTest & test_transpose8x8_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 /*|| CV_SIMD_SCALABLE_FP16*/)
// Note: The scalable backend does not yet implement fixed-length functions
Data<R> dataA0, dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7;
dataA1 *= 2;
dataA2 *= 4;
@ -1713,7 +1714,8 @@ template<typename R> struct TheTest
TheTest & test_reduce_sum8()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 /*|| CV_SIMD_SCALABLE_FP16*/)
// Note: The scalable backend does not yet implement fixed-length functions
Data<R> dataA, dataB, dataC, dataD, dataW, dataX, dataY, dataZ;
dataB *= 0.01f;
dataC *= 0.001f;
@ -1773,7 +1775,7 @@ template<typename R> struct TheTest
TheTest & test_loadstore_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
AlignedData<R> data;
AlignedData<R> out;
@ -1804,7 +1806,7 @@ template<typename R> struct TheTest
TheTest & test_float_cvt_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
AlignedData<v_float32> data;
// check conversion
@ -2449,7 +2451,7 @@ void test_hal_intrin_float16()
DUMP_ENTRY(v_float16);
#if CV_FP16
TheTest<v_float32>().test_loadstore_fp16_f32();
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
@ -2476,6 +2478,8 @@ void test_hal_intrin_float16()
.test_extract_n<0>().test_extract_n<1>()
.test_exp_fp16()
.test_log_fp16()
#else
std::cout << "SKIP: CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16 is not available" << std::endl;
#endif
;
#else

Loading…
Cancel
Save