diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 058443821a..1e0e812afc 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -46,7 +46,7 @@ set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F") list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL") -list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16) +list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD) list(APPEND CPU_ALL_OPTIMIZATIONS MSA) list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) @@ -326,6 +326,7 @@ if(X86 OR X86_64) elseif(ARM OR AARCH64) ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp") ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp") + ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp") if(NOT AARCH64) ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16") if(NOT MSVC) @@ -337,9 +338,11 @@ elseif(ARM OR AARCH64) endif() ocv_update(CPU_FP16_IMPLIES "NEON") else() - ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD") ocv_update(CPU_NEON_FLAGS_ON "") ocv_update(CPU_FP16_IMPLIES "NEON") + ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod") + ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON") set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}") endif() elseif(MIPS) diff --git a/cmake/checks/cpu_dotprod.cpp b/cmake/checks/cpu_dotprod.cpp new file mode 100644 index 0000000000..4f39c50659 --- /dev/null +++ b/cmake/checks/cpu_dotprod.cpp @@ -0,0 +1,24 @@ +#include + +#if defined __GNUC__ && (defined __arm__ || defined __aarch64__) +#include "arm_neon.h" +int test() +{ + const unsigned int src[] = { 0, 0, 0, 0 }; + unsigned int dst[4]; + uint32x4_t v_src = *(uint32x4_t*)src; + uint8x16_t v_m0 = *(uint8x16_t*)src; + uint8x16_t v_m1 = *(uint8x16_t*)src; + uint32x4_t v_dst = vdotq_u32(v_src, v_m0, v_m1); + *(uint32x4_t*)dst = v_dst; + return (int)dst[0]; +} +#else +#error "DOTPROD is not supported" +#endif + +int main() +{ + printf("%d\n", test()); + return 0; +} diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 568a8afef1..0163cf570c 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) ocv_add_dispatched_file(convert SSE2 AVX2 VSX3) ocv_add_dispatched_file(convert_scale SSE2 AVX2) ocv_add_dispatched_file(count_non_zero SSE2 AVX2) -ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX) +ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD) ocv_add_dispatched_file(mean SSE2 AVX2) ocv_add_dispatched_file(merge SSE2 AVX2) ocv_add_dispatched_file(split SSE2 AVX2) diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index e92798d6c9..758f905a64 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -79,6 +79,10 @@ # endif # define CV_FP16 1 #endif +#ifdef CV_CPU_COMPILE_NEON_DOTPROD +# include +# define CV_NEON_DOT 1 +#endif #ifdef CV_CPU_COMPILE_AVX2 # include # define CV_AVX2 1 diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index aaa89ed415..25cf3477c3 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -420,6 +420,27 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD +# define CV_TRY_NEON_DOTPROD 1 +# define CV_CPU_FORCE_NEON_DOTPROD 1 +# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1 +# define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD +# define CV_TRY_NEON_DOTPROD 1 +# define CV_CPU_FORCE_NEON_DOTPROD 0 +# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD)) +# define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args) +# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args) +#else +# define CV_TRY_NEON_DOTPROD 0 +# define CV_CPU_FORCE_NEON_DOTPROD 0 +# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0 +# define CV_CPU_CALL_NEON_DOTPROD(fn, args) +# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...) CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA # define CV_TRY_MSA 1 # define CV_CPU_FORCE_MSA 1 diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index c2cdcad075..9102316968 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -282,6 +282,7 @@ namespace cv { #define CV_CPU_AVX_5124FMAPS 27 #define CV_CPU_NEON 100 +#define CV_CPU_NEON_DOTPROD 101 #define CV_CPU_MSA 150 @@ -334,6 +335,7 @@ enum CpuFeatures { CPU_AVX_5124FMAPS = 27, CPU_NEON = 100, + CPU_NEON_DOTPROD = 101, CPU_MSA = 150, diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index e17972a3fc..5792694a40 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -78,8 +78,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_NEON_AARCH64 0 #endif -// TODO -#define CV_NEON_DOT 0 //////////// Utils //////////// @@ -667,11 +665,22 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64 } // 8 >> 32 +#ifdef CV_NEON_DOT +#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \ +inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \ +{ \ + return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\ +} \ +inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \ +{ \ + return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32) +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4, v_int8x16, s32) +#else inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); -#else const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0)); const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF)); const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0)); @@ -687,23 +696,15 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16), vshrq_n_u32(vreinterpretq_u32_u16(odd), 16)); return v_uint32x4(vaddq_u32(s0, s1)); -#endif } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); -#else return v_dotprod_expand(a, b) + c; -#endif } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); -#else int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); int16x8_t uzp1, uzp2; @@ -712,18 +713,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) int16x4_t uzpl1, uzpl2; _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2); return v_int32x4(vaddl_s16(uzpl1, uzpl2)); -#endif } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(c.val, a.val, b.val)); -#else return v_dotprod_expand(a, b) + c; -#endif } - +#endif // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) { @@ -832,45 +828,44 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_ } // 8 >> 32 +#ifdef CV_NEON_DOT +#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \ +inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \ +{ \ + return v_dotprod_expand(a, b); \ +} \ +inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \ +{ \ + return v_dotprod_expand(a, b, c); \ +} + +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32) +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4, v_int8x16, s32) +#else inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); -#else uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val)); uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val)); uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1)); uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1)); return v_uint32x4(vaddq_u32(s0, s1)); -#endif } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); -#else return v_dotprod_expand_fast(a, b) + c; -#endif } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); -#else int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val)); return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod))); -#endif } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(c.val, a.val, b.val)); -#else return v_dotprod_expand_fast(a, b) + c; -#endif } +#endif // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index adb957908d..3507009196 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -411,6 +411,7 @@ struct HWFeatures g_hwFeatureNames[CPU_AVX_5124FMAPS] = "AVX5124FMAPS"; g_hwFeatureNames[CPU_NEON] = "NEON"; + g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD"; g_hwFeatureNames[CPU_VSX] = "VSX"; g_hwFeatureNames[CPU_VSX3] = "VSX3"; @@ -555,6 +556,24 @@ struct HWFeatures #ifdef __aarch64__ have[CV_CPU_NEON] = true; have[CV_CPU_FP16] = true; + int cpufile = open("/proc/self/auxv", O_RDONLY); + + if (cpufile >= 0) + { + Elf64_auxv_t auxv; + const size_t size_auxv_t = sizeof(auxv); + + while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) + { + if (auxv.a_type == AT_HWCAP) + { + have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; + break; + } + } + + close(cpufile); + } #elif defined __arm__ && defined __ANDROID__ #if defined HAVE_CPUFEATURES CV_LOG_INFO(NULL, "calling android_getCpuFeatures() ...");