diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index e8e1e11f4e..25defb0460 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -52,7 +52,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SK list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16) list(APPEND CPU_ALL_OPTIMIZATIONS MSA) list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3) -list(APPEND CPU_ALL_OPTIMIZATIONS RVV) +list(APPEND CPU_ALL_OPTIMIZATIONS RVV FP16 RVV_ZVFH) list(APPEND CPU_ALL_OPTIMIZATIONS LSX) list(APPEND CPU_ALL_OPTIMIZATIONS LASX) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) @@ -170,6 +170,21 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ") set(CPU_BASELINE_DETECT ON) endif() +# This macro traverses all the dependent (IMPLIES) backends for the CPU_${OPT}_FLAGS_ON. +macro(ocv_cpu_riscv_update_flag FEATURE_NAME_LIST COMMON_OPTION) + foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) + unset(APPEND_TRAILING) + # traverse all dependency and merge extensions to a flag. + foreach(IMPLIE IN LISTS CPU_${OPT}_IMPLIES) + string(APPEND APPEND_TRAILING "_${CPU_${IMPLIE}_FLAG}") + endforeach() + string(APPEND APPEND_TRAILING "_${CPU_${OPT}_FLAG}") + # Update flag + set(CPU_${OPT}_FLAGS_ON "${COMMON_OPTION}${APPEND_TRAILING}") + message(STATUS "CPU_${OPT}_FLAGS_ON is ${CPU_${OPT}_FLAGS_ON}") + endforeach() +endmacro() + if(X86 OR X86_64) ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL") @@ -390,12 +405,28 @@ elseif(PPC64LE) set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}") elseif(RISCV) + if(NOT DEFINED PLATFORM_STR) + set(PLATFORM_STR "rv64gc") + endif() + + ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV;FP16;RVV_ZVFH") ocv_update(CPU_RVV_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv.cpp") - ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV") - ocv_update(CPU_RVV_FLAGS_ON "-march=rv64gcv") + ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp") + ocv_update(CPU_RVV_ZVFH_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv_fp16.cpp") + ocv_update(CPU_RVV_ZVFH_IMPLIES "RVV;FP16") + ocv_update(CPU_FP16_IMPLIES "RVV") + set(CPU_RVV_FLAG "v") + set(CPU_FP16_FLAG "zvfhmin") + set(CPU_RVV_ZVFH_FLAG "zvfh") + set(BASE_ARCHITECTURE "-march=${PLATFORM_STR}") + ocv_cpu_riscv_update_flag(CPU_KNOWN_OPTIMIZATIONS ${BASE_ARCHITECTURE}) + ocv_update(CPU_RVV_FLAGS_CONFLICT "-march=[^ ]*") - set(CPU_DISPATCH "" CACHE STRING "${HELP_CPU_DISPATCH}") + if(NOT ${BUILD_SHARED_LIBS}) # static build for k230 + add_extra_compiler_option("-static -static-libgcc -static-libstdc++") + endif() + set(CPU_DISPATCH "FP16;RVV_ZVFH" CACHE STRING "${HELP_CPU_DISPATCH}") set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") elseif(LOONGARCH64) @@ -495,6 +526,32 @@ macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRI endif() endmacro() +macro(ocv_cpu_riscv_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION) + unset(_POSTFIX) + unset(APPEND_TRAILING) + # Check each feature option. + foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) + string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND) + if(NOT ${OPT_FOUND} EQUAL -1) + # e.g. 
when ${CPU_${OPT}_FLAGS_ON} is "rv64gc_v_zvfhmin" + # the ${TRAILING_PART} will be "_v_zvfhmin" + # and the ${parts} will be "_v;_zvfhmin" (a list) + string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}") + string(REGEX MATCHALL "_[^_]+" parts ${TRAILING_PART}) + list(APPEND _POSTFIX ${parts}) + # remove ${CPU_${OPT}_FLAGS_ON} from ${FLAG_STRING} + string(REGEX REPLACE "${CPU_${OPT}_FLAGS_ON}( |$)" "" ${FLAG_STRING} ${${FLAG_STRING}}) + endif() + endforeach() + # Remove the duplicate extensions (e.g. _v, _v, ...). + list(REMOVE_DUPLICATES _POSTFIX) + # Merge into one extensions flag + foreach(TRAILING IN LISTS _POSTFIX) + string(APPEND APPEND_TRAILING "${TRAILING}") + endforeach() + set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${APPEND_TRAILING}") +endmacro() + foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "") if("${CPU_${OPT}_FLAGS_ON}" STREQUAL "disabled") @@ -597,6 +654,11 @@ if(AARCH64) endif() endif() +if(RISCV) + string(STRIP "${CPU_BASELINE_FLAGS}" CPU_BASELINE_FLAGS) + ocv_cpu_riscv_baseline_merge_feature_options(CPU_KNOWN_OPTIMIZATIONS CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE}) +endif() + foreach(OPT ${CPU_BASELINE_REQUIRE}) if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};") message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})") diff --git a/cmake/checks/cpu_fp16.cpp b/cmake/checks/cpu_fp16.cpp index c57b5d47b6..fd88981d73 100644 --- a/cmake/checks/cpu_fp16.cpp +++ b/cmake/checks/cpu_fp16.cpp @@ -23,6 +23,27 @@ int test() *(float16x4_t*)dst = v_dst; return (int)dst[0]; } +#elif (defined __riscv_zvfhmin && __riscv_zvfhmin) || (defined __riscv_zvfh && __riscv_zvfh) +#include <riscv_vector.h> + +int test() +{ + const _Float16 input1[] = {0.5f, 1.5f, 2.5f, 3.5f}; + const float input2[] = {-0.5f, -1.5f, -2.5f, -3.5f}; + short dst[4]; + + size_t vl = __riscv_vsetvl_e16m1(4); + + vfloat16m1_t in_f16 = __riscv_vle16_v_f16m1(input1, vl); + vfloat32m2_t in_f32 = __riscv_vle32_v_f32m2(input2, vl); + + vfloat32m2_t cvt_f32 = __riscv_vfwcvt_f_f_v_f32m2(in_f16, vl); + vfloat32m2_t res_f32 = __riscv_vfadd(in_f32, cvt_f32, vl); + vfloat16m1_t res_f16 = __riscv_vfncvt_f_f_w_f16m1(res_f32, vl); + + __riscv_vse16_v_f16m1((_Float16*)dst, res_f16, vl); + return (int)dst[0]; +} #else #error "FP16 is not supported" #endif diff --git a/cmake/checks/cpu_rvv_fp16.cpp b/cmake/checks/cpu_rvv_fp16.cpp new file mode 100644 index 0000000000..3bc70a4d0c --- /dev/null +++ b/cmake/checks/cpu_rvv_fp16.cpp @@ -0,0 +1,25 @@ +#include <stdio.h> + +#if defined(__riscv) && __riscv && defined (__riscv_zvfh) && __riscv_zvfh +# include <riscv_vector.h> + +int test() +{ + const _Float16 input1[] = {0.5f, 1.5f, 2.5f, 3.5f}; + const _Float16 input2[] = {-0.5f, -1.5f, -2.5f, -3.5f}; + + size_t vl = __riscv_vsetvl_e16m1(4); + vfloat16m1_t vec1 = __riscv_vle16_v_f16m1(input1, vl); + vfloat16m1_t vec2 = __riscv_vle16_v_f16m1(input2, vl); + vfloat16m1_t result = __riscv_vfadd_vv_f16m1(vec1, vec2, vl); + return (int)__riscv_vfmv_f_s_f16m1_f16(result); +} +#else +#error "RISC-V Vector Extension with Half-Precision Floating-Point (zvfh) is not supported" +#endif + +int main() +{ + printf("%d\n", test()); + return 0; +} diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 607f286615..853d91ca23 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -74,6 +74,8 @@ #ifdef CV_CPU_COMPILE_FP16 # if
defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) # include +# elif defined(__riscv_vector) +# include # else # include # endif @@ -250,6 +252,11 @@ struct VZeroUpperGuard { # define CV_FP16 1 #endif +#if defined(__riscv_zvfhmin) && __riscv_zvfhmin || (defined(__riscv_zvfh) && __riscv_zvfh) +# include +# define CV_FP16 1 +#endif + #endif // !__OPENCV_BUILD && !__CUDACC (Compatibility code) diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 04b00d2024..4e09f2a9a8 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -567,6 +567,27 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_RVV_ZVFH +# define CV_TRY_RVV_ZVFH 1 +# define CV_CPU_FORCE_RVV_ZVFH 1 +# define CV_CPU_HAS_SUPPORT_RVV_ZVFH 1 +# define CV_CPU_CALL_RVV_ZVFH(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_RVV_ZVFH_(fn, args) return (opt_RVV_ZVFH::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_RVV_ZVFH +# define CV_TRY_RVV_ZVFH 1 +# define CV_CPU_FORCE_RVV_ZVFH 0 +# define CV_CPU_HAS_SUPPORT_RVV_ZVFH (cv::checkHardwareSupport(CV_CPU_RVV_ZVFH)) +# define CV_CPU_CALL_RVV_ZVFH(fn, args) if (CV_CPU_HAS_SUPPORT_RVV_ZVFH) return (opt_RVV_ZVFH::fn args) +# define CV_CPU_CALL_RVV_ZVFH_(fn, args) if (CV_CPU_HAS_SUPPORT_RVV_ZVFH) return (opt_RVV_ZVFH::fn args) +#else +# define CV_TRY_RVV_ZVFH 0 +# define CV_CPU_FORCE_RVV_ZVFH 0 +# define CV_CPU_HAS_SUPPORT_RVV_ZVFH 0 +# define CV_CPU_CALL_RVV_ZVFH(fn, args) +# define CV_CPU_CALL_RVV_ZVFH_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_RVV_ZVFH(fn, args, mode, ...) 
CV_CPU_CALL_RVV_ZVFH(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX # define CV_TRY_LSX 1 # define CV_CPU_FORCE_LSX 1 diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index c8a0e17cdb..7960c49ab4 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -288,6 +288,7 @@ namespace cv { #define CV_CPU_VSX3 201 #define CV_CPU_RVV 210 +#define CV_CPU_RVV_ZVFH 211 #define CV_CPU_LSX 230 #define CV_CPU_LASX 231 @@ -350,6 +351,7 @@ enum CpuFeatures { CPU_VSX3 = 201, CPU_RVV = 210, + CPU_RVV_ZVFH = 211, CPU_LSX = 230, CPU_LASX = 231, @@ -384,6 +386,8 @@ enum CpuFeatures { #if defined __ARM_FP16_FORMAT_IEEE \ && !defined __CUDACC__ # define CV_FP16_TYPE 1 +#elif (defined(__riscv_zvfh) && __riscv_zvfh) || (defined(__riscv_zvfhmin) && __riscv_zvfhmin) +# define CV_FP16_TYPE 1 #else # define CV_FP16_TYPE 0 #endif @@ -838,12 +842,14 @@ class hfloat public: #if CV_FP16_TYPE hfloat() = default; - explicit hfloat(float x) { h = (__fp16)x; } operator float() const { return (float)h; } #if defined __ARM_FP16_FORMAT_IEEE + explicit hfloat(float x) { h = (__fp16)x; } protected: __fp16 h; #else + explicit hfloat(float x) { h = (_Float16)x; } + explicit operator _Float16() const { return h; } protected: _Float16 h; #endif diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 91bf990e1e..67aba0bf27 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -343,6 +343,10 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD_SCALABLE_64F 0 #endif +#ifndef CV_SIMD_SCALABLE_FP16 +#define CV_SIMD_SCALABLE_FP16 0 +#endif + //================================================================================================== template struct V_RegTraits @@ -412,6 +416,9 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void); CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void); CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void); + #if CV_SIMD_SCALABLE_FP16 + CV_DEF_REG_TRAITS(v, v_float16, hfloat, f16, v_float16, v_float32, v_float64, v_int16, v_int16); + #endif CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void); CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void); CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32); @@ -542,6 +549,7 @@ using namespace CV__SIMD_NAMESPACE; #define CV__SIMD_NAMESPACE simd namespace CV__SIMD_NAMESPACE { #define CV_SIMD 0 + #define CV_SIMD_FP16 0 #define CV_SIMD_WIDTH 128 /* 1024/8 */ #define VXPREFIX(func) v##func @@ -565,7 +573,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); } inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); } inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_setall_f16(hfloat v) { return VXPREFIX(_setall_f16)(v); } #endif inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); } @@ -585,7 +593,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_setzero_s8() { return 
VXPREFIX(_setzero_s8)(); } inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); } inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_setzero_f16() { return VXPREFIX(_setzero_f16)(); } #endif inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); } @@ -605,7 +613,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); } inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); } inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_load(const hfloat * ptr) { return VXPREFIX(_load)(ptr); } #endif inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); } @@ -625,7 +633,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_load_aligned(const hfloat * ptr) { return VXPREFIX(_load_aligned)(ptr); } #endif inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); } @@ -645,7 +653,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_load_low(const hfloat * ptr) { return VXPREFIX(_load_low)(ptr); } #endif inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); } @@ -665,7 +673,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_load_halves(const hfloat * ptr0, const hfloat * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } #endif inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } @@ -685,7 +693,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_lut(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); } #endif inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } @@ -705,7 +713,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } 
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) inline v_float16 vx_lut_pairs(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } #endif inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index c8d4ec37b5..2c797fb8ee 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -30,6 +30,12 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD_SCALABLE 1 #define CV_SIMD_SCALABLE_64F 1 +#if defined(__riscv_zvfh) && __riscv_zvfh + #define CV_SIMD_SCALABLE_FP16 1 +#else + #define CV_SIMD_SCALABLE_FP16 0 +#endif + using v_uint8 = vuint8m1_t; using v_int8 = vint8m1_t; @@ -40,6 +46,9 @@ using v_int32 = vint32m1_t; using v_uint64 = vuint64m1_t; using v_int64 = vint64m1_t; +#if CV_SIMD_SCALABLE_FP16 +using v_float16 = vfloat16m1_t; +#endif using v_float32 = vfloat32m1_t; #if CV_SIMD_SCALABLE_64F using v_float64 = vfloat64m1_t; @@ -117,6 +126,13 @@ OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m2_t, uint64_t, e64m2, 64) OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m4_t, uint64_t, e64m4, 64) OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m8_t, uint64_t, e64m8, 64) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m1_t, hfloat, e16m1, 16) +OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m2_t, hfloat, e16m2, 16) +OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m4_t, hfloat, e16m4, 16) +OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m8_t, hfloat, e16m8, 16) +#endif + OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m1_t, float, e32m1, 32) OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m2_t, float, e32m2, 32) OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m4_t, float, e32m4, 32) @@ -155,6 +171,12 @@ OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int) OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64) OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64) +#if CV_SIMD_SCALABLE_FP16 +inline hfloat v_get0(const v_float16& v) \ +{ \ + return (hfloat)__riscv_vfmv_f(v); \ +} +#endif inline float v_get0(const v_float32& v) \ { \ return __riscv_vfmv_f(v); \ @@ -197,6 +219,20 @@ inline v_##_Tpv v_setall_##suffix(_Tp v) \ return __riscv_vfmv_v_f_##suffix##m1(v, vl); \ } +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_setzero_f16() +{ + return __riscv_vfmv_v_f_f16m1(0, VTraits::vlanes()); +} +inline v_float16 v_setall_f16(float v) // In some cases we may use v_setall_f16(1.0f) +{ + return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits::vlanes()); +} +inline v_float16 v_setall_f16(hfloat v) +{ + return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits::vlanes()); +} +#endif OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits::vlanes()) @@ -216,6 +252,9 @@ OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8) OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16) OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32) OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float16, f16) +#endif OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64) @@ -234,6 +273,10 @@ inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8) 
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16) OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, float16, u16, f16, u16, f16) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, float16, s16, f16, i16, f16) +#endif OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32) OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32) OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64) @@ -277,6 +320,14 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64) OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8) OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16) OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float16, u8, f16, u, f, 8, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float16, u32, f16, u, f, 32, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float16, u64, f16, u, f, 64, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float16, s8, f16, i, f, 8, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float16, s32, f16, i, f, 32, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float16, s64, f16, i, f, 64, 16) +#endif OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32) OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32) OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32) @@ -291,6 +342,17 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64) OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64) OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64) // Three times reinterpret +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_reinterpret_as_f16(const v_float64& v) \ +{ \ + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\ +} + +inline v_float64 v_reinterpret_as_f64(const v_float16& v) \ +{ \ + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_f16m1_u16m1(v)));\ +} +#endif inline v_float32 v_reinterpret_as_f32(const v_float64& v) \ { \ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\ @@ -332,9 +394,12 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \ } \ template inline _Tp v_extract_n(_Tpvec v, int i = s) \ { \ - return __riscv_vfmv_f(__riscv_vslidedown(v, i, vl)); \ + return (_Tp)__riscv_vfmv_f(__riscv_vslidedown(v, i, vl)); \ } +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float16, hfloat, VTraits::vlanes()) +#endif OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, VTraits::vlanes()) @@ -343,7 +408,7 @@ OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, VTraits::vlanes()) #define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \ inline _Tp v_extract_highest(_Tpvec v) \ { \ - return v_extract_n(v, vl-1); \ + return (_Tp)v_extract_n(v, vl-1); \ } OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits::vlanes()) @@ -354,6 +419,9 @@ OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits::vlanes()) 
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float16, hfloat, VTraits::vlanes()) +#endif OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits::vlanes()) @@ -408,6 +476,47 @@ _Tpvec v_load_##suffix(Targs... nScalars) \ return v_load({nScalars...}); \ } +#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ \ + return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \ +} \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ \ + return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ \ + __riscv_vse##width##_v_##suffix##m1((_Float16*)ptr, a, vl); \ +} \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ \ + return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, hvl); \ +} \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m1((_Float16*)ptr0, hvl), __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr1, hvl), hvl, vl); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width((_Float16*)ptr, a, vl); \ +} \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width((_Float16*)ptr, a, vl); \ +} \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width((_Float16*)ptr, a, vl); \ +} \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width((_Float16*)ptr, a, hvl); \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width((_Float16*)ptr, __riscv_vslidedown_vx_##suffix##m1(a, hvl, vl), hvl); \ +} OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, u8) OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, i8) @@ -417,6 +526,9 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, i32) OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, u64) OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, i64) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(v_float16, vfloat16m1_t, hfloat, VTraits::vlanes() /2 , VTraits::vlanes(), 16, f16) +#endif OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits::vlanes() /2 , VTraits::vlanes(), 32, f32) #if CV_SIMD_SCALABLE_64F @@ -430,16 +542,25 @@ inline _Tpvec v_lut(const _Tp* tab, const int* idx) \ auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ } +#define OPENCV_HAL_IMPL_RVV_LUT_FP16(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_lut(const _Tp* tab, const int* idx) \ +{ \ + auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return __riscv_vloxei32((_Float16*)tab, vidx, VTraits<_Tpvec>::vlanes()); 
\ +} OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4) OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2) OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1) OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_LUT_FP16(v_float16, hfloat, m2) +#endif OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2) #endif -#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \ +#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, _TpCast, suffix1, suffix2, v_trunc) \ inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \ { \ auto v0 = __riscv_vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \ @@ -449,19 +570,22 @@ inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \ auto sh1 = __riscv_vslide1up(v_trunc(__riscv_vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); \ auto vid = __riscv_vor(sh1, v_trunc(__riscv_vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \ auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ - return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ + return __riscv_vloxei32((_TpCast *)tab, vidx, VTraits<_Tpvec>::vlanes()); \ } -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, __riscv_vlmul_trunc_u32mf2) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, schar, m2, m4, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, short, m1, m2, OPENCV_HAL_NOP) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float16, hfloat, _Float16, m1, m2, OPENCV_HAL_NOP) +#endif +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, int, mf2, m1, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, float, mf2, m1, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, int64_t, mf2, m1, __riscv_vlmul_trunc_u32mf2) #if CV_SIMD_SCALABLE_64F -OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, __riscv_vlmul_trunc_u32mf2) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, double, mf2, m1, __riscv_vlmul_trunc_u32mf2) #endif -#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \ +#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, _TpCast, suffix0, suffix1, suffix2, v_trunc) \ inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \ { \ auto v0 = __riscv_vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \ @@ -481,12 +605,15 @@ inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \ auto shwid1 = __riscv_vslide1up(__riscv_vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \ auto vid = __riscv_vor(shwid1, __riscv_vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \ auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ - return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ + return __riscv_vloxei32((_TpCast *)tab, vidx, VTraits<_Tpvec>::vlanes()); \ } -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2) -OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2) 
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, schar, m1, m2, m4, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, short, mf2 , m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, int, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float16, hfloat, _Float16, mf2 , m1, m2, OPENCV_HAL_NOP) +#endif +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, float, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2) #define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \ inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \ @@ -557,6 +684,12 @@ OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, __riscv_vsaddu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, __riscv_vssubu) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, __riscv_vsadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, __riscv_vssub) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, add, __riscv_vfadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, sub, __riscv_vfsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, mul, __riscv_vfmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, div, __riscv_vfdiv) +#endif OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, __riscv_vadd) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, __riscv_vsub) OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, __riscv_vmul) @@ -602,6 +735,10 @@ OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, __riscv_vadd) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, __riscv_vmul) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, __riscv_vmul) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, __riscv_vfmul) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float16, __riscv_vfadd) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float16, __riscv_vfmul) +#endif #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, __riscv_vfadd) OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, __riscv_vfmul) @@ -689,14 +826,30 @@ OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits::vlanes()) -#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) \ +#if CV_SIMD_SCALABLE_FP16 +#define OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(intrin) \ +inline v_float16 intrin (const v_float16& a, const v_float16& b) \ +{ \ + return __riscv_vreinterpret_f16m1(intrin(__riscv_vreinterpret_i16m1(a), __riscv_vreinterpret_i16m1(b))); \ +} +OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_and) +OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_or) +OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_xor) + +inline v_float16 v_not (const v_float16& a) \ +{ \ + return __riscv_vreinterpret_f16m1(v_not(__riscv_vreinterpret_i16m1(a))); \ +} +#endif + +#define OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(intrin) \ inline v_float32 intrin (const v_float32& a, const v_float32& b) \ { \ return __riscv_vreinterpret_f32m1(intrin(__riscv_vreinterpret_i32m1(a), __riscv_vreinterpret_i32m1(b))); \ } -OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and) -OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or) -OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor) +OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_and) +OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_or) +OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_xor) inline v_float32 v_not (const v_float32& a) \ { \ @@ -774,6 +927,18 @@ inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \ return _Tpvec(res); \ } //TODO +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, op, intrin, suffix) \ +inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + size_t VLEN = VTraits<_Tpvec>::vlanes(); \ + union { uint64_t u; _Float16 d; } ones; \ + ones.u = -1; \ + auto diff = intrin(a, b, VLEN); \ + auto z = __riscv_vfmv_v_f_##suffix##m1(0, VLEN); \ + auto res = 
__riscv_vfmerge(z, ones.d, diff, VLEN); \ + return _Tpvec(res); \ +} //TODO + #define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) \ OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, __riscv_vmseq, suffix) \ OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, __riscv_vmsne, suffix) \ @@ -798,6 +963,13 @@ OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, __riscv_vmfgt, suffix) \ OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, __riscv_vmfle, suffix) \ OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, __riscv_vmfge, suffix) +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_FP16(_Tpvec, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, eq, __riscv_vmfeq, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, ne, __riscv_vmfne, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, lt, __riscv_vmflt, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, gt, __riscv_vmfgt, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, le, __riscv_vmfle, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, ge, __riscv_vmfge, suffix) OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8) OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16) @@ -807,11 +979,19 @@ OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8) OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16) OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32) OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_FP16(v_float16, f16) +#endif OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64) #endif +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_not_nan(const v_float16& a) +{ return v_eq(a, a); } +#endif + inline v_float32 v_not_nan(const v_float32& a) { return v_eq(a, a); } @@ -840,6 +1020,10 @@ OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, __riscv_vminu, VTraits:: OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, __riscv_vmaxu, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, __riscv_vmin, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, __riscv_vmax, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float16, v_min, __riscv_vfmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float16, v_max, __riscv_vfmax, VTraits::vlanes()) +#endif OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, __riscv_vfmin, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, __riscv_vfmax, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F @@ -990,6 +1174,10 @@ OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits::vlanes(), OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits::vlanes(), redmaxu) OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits::vlanes(), redmax) OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits::vlanes(), fredmax) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, max, hfloat, f16, VTraits::vlanes(), fredmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, min, hfloat, f16, VTraits::vlanes(), fredmin) +#endif inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b, const v_float32& c, const v_float32& d) @@ -1043,53 +1231,31 @@ inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b, } ////////////// Square-Root ////////////// - -inline v_float32 v_sqrt(const v_float32& x) -{ - return __riscv_vfsqrt(x, VTraits::vlanes()); -} - -inline v_float32 v_invsqrt(const v_float32& x) -{ - v_float32 one = v_setall_f32(1.0f); - return v_div(one, v_sqrt(x)); -} - -#if CV_SIMD_SCALABLE_64F -inline v_float64 v_sqrt(const v_float64& x) -{ - return __riscv_vfsqrt(x, 
VTraits::vlanes()); +#define OPENCV_HAL_IMPL_RVV_SQR_FP(_Tpvec, _setAllFunc) \ +inline _Tpvec v_sqrt(const _Tpvec& x) \ +{ \ + return __riscv_vfsqrt(x, VTraits<_Tpvec>::vlanes()); \ +} \ +inline _Tpvec v_invsqrt(const _Tpvec& x) \ +{ \ + return v_div(_setAllFunc(1.0f), v_sqrt(x)); \ +} \ +inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpvec x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<_Tpvec>::vlanes()), b, b, VTraits<_Tpvec>::vlanes()); \ + return v_sqrt(x); \ +} \ +inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<_Tpvec>::vlanes()), b, b, VTraits<_Tpvec>::vlanes()); \ } -inline v_float64 v_invsqrt(const v_float64& x) -{ - v_float64 one = v_setall_f64(1.0f); - return v_div(one, v_sqrt(x)); -} +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_SQR_FP(v_float16, v_setall_f16) #endif - -inline v_float32 v_magnitude(const v_float32& a, const v_float32& b) -{ - v_float32 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); - return v_sqrt(x); -} - -inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b) -{ - return v_float32(__riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes())); -} - +OPENCV_HAL_IMPL_RVV_SQR_FP(v_float32, v_setall_f32) #if CV_SIMD_SCALABLE_64F -inline v_float64 v_magnitude(const v_float64& a, const v_float64& b) -{ - v_float64 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); - return v_sqrt(x); -} - -inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b) -{ - return __riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); -} +OPENCV_HAL_IMPL_RVV_SQR_FP(v_float64, v_setall_f64) #endif ////////////// Multiply-Add ////////////// @@ -1113,6 +1279,18 @@ inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c) return v_fma(a, b, c); } +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_fma(const v_float16& a, const v_float16& b, const v_float16& c) +{ + return __riscv_vfmacc(c, a, b, VTraits::vlanes()); +} + +inline v_float16 v_muladd(const v_float16& a, const v_float16& b, const v_float16& c) +{ + return v_fma(a, b, c); +} +#endif + #if CV_SIMD_SCALABLE_64F inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c) { @@ -1153,6 +1331,13 @@ inline bool v_check_all(const v_uint16& a) inline bool v_check_any(const v_uint16& a) { return v_check_any(v_reinterpret_as_s16(a)); } +#if CV_SIMD_SCALABLE_FP16 +inline bool v_check_all(const v_float16& a) +{ return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_float16& a) +{ return v_check_any(v_reinterpret_as_s16(a)); } +#endif + inline bool v_check_all(const v_uint32& a) { return v_check_all(v_reinterpret_as_s32(a)); } inline bool v_check_any(const v_uint32& a) @@ -1186,6 +1371,9 @@ inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \ OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff) OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff) OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float16, absdiff) +#endif OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff) @@ -1212,6 +1400,9 @@ inline _Tprvec v_abs(const _Tpvec& a) \ OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8) OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16) OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32) +#if CV_SIMD_SCALABLE_FP16 
+OPENCV_HAL_IMPL_RVV_ABS(v_float16, v_float16, f16) +#endif OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64) @@ -1246,6 +1437,12 @@ OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits::vlanes()) OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_select(const v_float16& mask, const v_float16& a, const v_float16& b) \ +{ \ + return __riscv_vmerge(b, a, __riscv_vmfne(mask, 0, VTraits::vlanes()), VTraits::vlanes()); \ +} +#endif inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b) \ { \ @@ -1314,12 +1511,39 @@ template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ { CV_UNUSED(b); return a; } +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float16, f16, VTraits::vlanes()) +#endif OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits::vlanes()) #endif ////////////// Convert to float ////////////// + +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_cvt_f16(const v_float32 &a) +{ + return __riscv_vfncvt_f(__riscv_vlmul_ext_f32m2(a), VTraits::vlanes()); +} +inline v_float16 v_cvt_f16(const v_float32 &a, const v_float32 &b) +{ + return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f32m2(a),1,b), VTraits::vlanes()); +} +inline v_float16 v_cvt_f16(const v_int16 &a) +{ + return __riscv_vfcvt_f(a, VTraits::vlanes()); +} +inline v_float32 v_cvt_f32(const v_float16 &a) +{ + return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); +} +inline v_float32 v_cvt_f32_high(const v_float16 &a) +{ + return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); +} +#endif + inline v_float32 v_cvt_f32(const v_int32& a) { return __riscv_vfcvt_f_x_v_f32m1(a, VTraits::vlanes()); @@ -1367,13 +1591,16 @@ inline v_float64 v_cvt_f64(const v_int64& a) #define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \ template inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \ { \ - return v_setall_##suffix(v_extract_n(v, i)); \ + return v_setall_##suffix((_Float16)v_extract_n(v, i)); \ } \ inline _Tpvec v_broadcast_highest(_Tpvec v) \ { \ - return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \ + return v_setall_##suffix((_Float16)v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \ } +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_BROADCAST(v_float16, f16) +#endif OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32) OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32) OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32) @@ -1390,6 +1617,9 @@ OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8) OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8) OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16) OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_REVERSE(v_float16, 16) +#endif OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32) OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32) OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32) @@ -1531,6 +1761,9 @@ OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_H OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, __riscv_vreinterpret_u8m2, __riscv_vreinterpret_u8m1) OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, 
__riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_ZIP(v_float16, vfloat16m2_t, f16, 16, 32, __riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1) +#endif OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP) OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1) OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1) @@ -1580,66 +1813,72 @@ OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16) OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16) OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32) OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_UNPACKS(v_float16, 16) +#endif OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64) #endif -#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \ +#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, _TpCast, suffix, width, hwidth, vl) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ { \ - a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*2, VTraits::vlanes()); \ - b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*2, VTraits::vlanes()); \ + a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*2, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*2, VTraits::vlanes()); \ }\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ { \ - a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*3, VTraits::vlanes()); \ - b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*3, VTraits::vlanes()); \ - c = __riscv_vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*3, VTraits::vlanes()); \ + a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*3, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*3, VTraits::vlanes()); \ + c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*3, VTraits::vlanes()); \ } \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ v_##_Tpvec& c, v_##_Tpvec& d) \ { \ \ - a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*4, VTraits::vlanes()); \ - b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*4, VTraits::vlanes()); \ - c = __riscv_vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*4, VTraits::vlanes()); \ - d = __riscv_vlse##width##_v_##suffix##m1(ptr+3, sizeof(_Tp)*4, VTraits::vlanes()); \ + a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*4, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*4, VTraits::vlanes()); \ + c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*4, VTraits::vlanes()); \ + d = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+3), sizeof(_Tp)*4, VTraits::vlanes()); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - __riscv_vsse##width(ptr, sizeof(_Tp)*2, a, VTraits::vlanes()); \ - __riscv_vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*2, a, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*2, b, VTraits::vlanes()); \ } \ inline void v_store_interleave( _Tp* 
ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - __riscv_vsse##width(ptr, sizeof(_Tp)*3, a, VTraits::vlanes()); \ - __riscv_vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits::vlanes()); \ - __riscv_vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*3, a, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*3, b, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)(ptr+2), sizeof(_Tp)*3, c, VTraits::vlanes()); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ const v_##_Tpvec& c, const v_##_Tpvec& d, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ { \ - __riscv_vsse##width(ptr, sizeof(_Tp)*4, a, VTraits::vlanes()); \ - __riscv_vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits::vlanes()); \ - __riscv_vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits::vlanes()); \ - __riscv_vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits::vlanes()); \ -} - -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits::vlanes()) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits::vlanes()) + __riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*4, a, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*4, b, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)(ptr+2), sizeof(_Tp)*4, c, VTraits::vlanes()); \ + __riscv_vsse##width((_TpCast *)(ptr+3), sizeof(_Tp)*4, d, VTraits::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, uchar, u8, 8, 4, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, schar, i8, 8, 4, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, ushort, u16, 16, 8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, short, i16, 16, 8, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_FP16 +OPENCV_HAL_IMPL_RVV_INTERLEAVED(float16, hfloat, _Float16, f16, 16, 8, VTraits::vlanes()) +#endif +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, unsigned, u32, 32, 16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, int, i32, 32, 16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, float, f32, 32, 16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, uint64, u64, 64, 32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, int64, i64, 64, 32, VTraits::vlanes()) #if CV_SIMD_SCALABLE_64F -OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, double, f64, 64, 32, VTraits::vlanes()) #endif static uint64_t idx_interleave_pairs[] = { \ @@ -1781,6 +2020,10 @@ inline int64 v_signmask(const v_uint8& a) { return v_signmask(v_reinterpret_as_s8(a)); } inline int64 v_signmask(const v_uint16& a) { return v_signmask(v_reinterpret_as_s16(a)); } +#if CV_SIMD_SCALABLE_FP16 +inline int v_signmask(const v_float16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } +#endif 
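// Editor's sketch (not part of this patch): an illustrative use of the scalable FP16
// universal intrinsics added above, assuming CV_SIMD_SCALABLE_FP16 is enabled and that
// `src_a`, `src_b` and `dst` are hypothetical hfloat buffers holding at least
// VTraits<v_float16>::vlanes() elements:
//   v_float16 va = vx_load(src_a);                              // load hfloat data
//   v_float16 vb = vx_load(src_b);
//   v_float16 vr = v_fma(va, vb, vx_setall_f16(hfloat(1.0f)));  // va*vb + 1
//   v_store(dst, vr);                                           // store back as hfloat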
inline int v_signmask(const v_uint32& a) { return v_signmask(v_reinterpret_as_s32(a)); } inline int v_signmask(const v_float32& a) @@ -1862,6 +2105,35 @@ inline void v_pack_store(hfloat* ptr, const v_float32& v) } #endif ////////////// Rounding ////////////// +#if CV_SIMD_SCALABLE_FP16 +inline v_int16 v_round(const v_float16& a) +{ + return __riscv_vfcvt_x(a, VTraits<v_float16>::vlanes()); +} + +inline v_int16 v_floor(const v_float16& a) +{ +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999 + return __riscv_vfcvt_x_f_v_i16m1_rm(a, 2 /*RDN, round-down*/, VTraits<v_float16>::vlanes()); +#else + return __riscv_vfcvt_x(__riscv_vfsub(a, 0.5f - 1e-5, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes()); +#endif +} + +inline v_int16 v_ceil(const v_float16& a) +{ +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999 + return __riscv_vfcvt_x_f_v_i16m1_rm(a, 3 /*RUP, round-up*/, VTraits<v_float16>::vlanes()); +#else + return __riscv_vfcvt_x(__riscv_vfadd(a, 0.5f - 1e-5, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes()); +#endif +} + +inline v_int16 v_trunc(const v_float16& a) +{ + return __riscv_vfcvt_rtz_x(a, VTraits<v_float16>::vlanes()); +} +#endif inline v_int32 v_round(const v_float32& a) { // return vfcvt_x(vfadd(a, 1e-6, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes()); @@ -2155,6 +2427,41 @@ inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const #endif // TODO: only 128 bit now. +#if CV_SIMD_SCALABLE_FP16 +inline v_float16 v_matmul( const v_float16 &v, + const v_float16 &m0, const v_float16 &m1, + const v_float16 &m2, const v_float16 &m3, + const v_float16 &m4, const v_float16 &m5, + const v_float16 &m6, const v_float16 &m7) { + vfloat16m1_t res; + res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 7), m7, VTraits<v_float16>::vlanes()); + return res; +} +inline v_float16 v_matmuladd( const v_float16 &v, + const v_float16 &m0, const v_float16 &m1, + const v_float16 &m2, const v_float16 &m3, + const v_float16 &m4, const v_float16 &m5, + const v_float16 &m6, + const v_float16 &a) { + vfloat16m1_t res; + res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits<v_float16>::vlanes()); + res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits<v_float16>::vlanes()); + return __riscv_vfadd(res, a, VTraits<v_float16>::vlanes()); +} +#endif + inline v_float32 v_matmul(const v_float32& v, const v_float32& m0, const v_float32& m1, const v_float32& m2, const v_float32& m3) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 3b8a7e583d..893231191c 100644 ---
a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -434,6 +434,7 @@ struct HWFeatures g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL"; g_hwFeatureNames[CPU_RVV] = "RVV"; + g_hwFeatureNames[CPU_RVV_ZVFH] = "RVV_ZVFH"; g_hwFeatureNames[CPU_LSX] = "LSX"; g_hwFeatureNames[CPU_LASX] = "LASX"; @@ -712,6 +713,12 @@ struct HWFeatures #if defined __riscv && defined __riscv_vector have[CV_CPU_RVV] = true; + #if (defined __riscv_zvfh && __riscv_zvfh) || (defined __riscv_zvfhmin && __riscv_zvfhmin) + have[CV_CPU_FP16] = true; + #endif + #if defined __riscv_zvfh && __riscv_zvfh + have[CV_CPU_RVV_ZVFH] = true; + #endif #endif #if defined __loongarch64 && defined __linux__ diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index eb1866d7c4..e2d0de42f1 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -159,8 +159,8 @@ template std::ostream & operator<<(std::ostream & out, const Data::vlanes(); ++i) { - // out << std::hex << +V_TypeTraits::lane_type>::reinterpret_int(d.d[i]); - out << +d.d[i]; + out << std::hex << +V_TypeTraits::lane_type>::reinterpret_int(d.d[i]); + // out << +d.d[i]; // Note: No operator '<<' for _Float16 if (i + 1 < VTraits::vlanes()) out << ", "; } @@ -182,7 +182,7 @@ template<> inline void EXPECT_COMPARE_EQ_(const double a, const double b EXPECT_DOUBLE_EQ( a, b ); } -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) template<> inline void EXPECT_COMPARE_EQ_(const hfloat a, const hfloat b) { EXPECT_LT(std::abs(float(a - b)), 0.126); @@ -564,7 +564,7 @@ template struct TheTest // Handle accuracy for fp16 TheTest & test_div_fp16() { -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) Data dataA, dataB; dataB.reverse(); R a = dataA, b = dataB; @@ -1572,7 +1572,7 @@ template struct TheTest TheTest & test_matmul_fp16() { -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) Data dataV, data0, data1, data2, data3, data4, data5, data6, data7; data1.reverse(); data2 += 2; @@ -1657,7 +1657,8 @@ template struct TheTest TheTest & test_transpose8x8_fp16() { -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 /*|| CV_SIMD_SCALABLE_FP16*/) +// Note: The scalable backend does not yet implement fixed-length functions Data dataA0, dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7; dataA1 *= 2; dataA2 *= 4; @@ -1713,7 +1714,8 @@ template struct TheTest TheTest & test_reduce_sum8() { -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 /*|| CV_SIMD_SCALABLE_FP16*/) +// Note: The scalable backend does not yet implement fixed-length functions Data dataA, dataB, dataC, dataD, dataW, dataX, dataY, dataZ; dataB *= 0.01f; dataC *= 0.001f; @@ -1773,7 +1775,7 @@ template struct TheTest TheTest & test_loadstore_fp16() { -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) AlignedData data; AlignedData out; @@ -1804,7 +1806,7 @@ template struct TheTest TheTest & test_float_cvt_fp16() { -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) AlignedData data; // check conversion @@ -2449,7 +2451,7 @@ void test_hal_intrin_float16() DUMP_ENTRY(v_float16); #if CV_FP16 TheTest().test_loadstore_fp16_f32(); -#if CV_SIMD_FP16 +#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) TheTest() .test_loadstore_fp16() .test_float_cvt_fp16() @@ -2476,6 +2478,8 @@ void test_hal_intrin_float16() .test_extract_n<0>().test_extract_n<1>() .test_exp_fp16() .test_log_fp16() +#else + std::cout << "SKIP: CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16 is not available" << std::endl; #endif ; #else