From ba4d6c859d21536f84e0328c16f4cc3e96bf3065 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Wed, 18 Oct 2023 22:06:20 +0300 Subject: [PATCH] added detection & dispatching of some modern NEON instructions (NEON_FP16, NEON_BF16) (#24420) * added more or less cross-platform (based on POSIX signal() semantics) method to detect various NEON extensions, such as FP16 SIMD arithmetics, BF16 SIMD arithmetics, SIMD dotprod etc. It could be propagated to other instruction sets if necessary. * hopefully fixed compile errors * continue to fix CI * another attempt to fix build on Linux aarch64 * * reverted to the original method to detect special arm neon instructions without signal() * renamed FP16_SIMD & BF16_SIMD to NEON_FP16 and NEON_BF16, respectively * removed extra whitespaces --- cmake/OpenCVCompilerOptimizations.cmake | 11 ++++- cmake/checks/cpu_neon_bf16.cpp | 46 +++++++++++++++++++ .../{cpu_dotprod.cpp => cpu_neon_dotprod.cpp} | 0 cmake/checks/cpu_neon_fp16.cpp | 46 +++++++++++++++++++ modules/core/include/opencv2/core/cvdef.h | 4 ++ modules/core/src/system.cpp | 36 ++++++++++----- 6 files changed, 130 insertions(+), 13 deletions(-) create mode 100644 cmake/checks/cpu_neon_bf16.cpp rename cmake/checks/{cpu_dotprod.cpp => cpu_neon_dotprod.cpp} (100%) create mode 100644 cmake/checks/cpu_neon_fp16.cpp diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 90ad7d783c..68867a25d0 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -331,7 +331,9 @@ if(X86 OR X86_64) elseif(ARM OR AARCH64) ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp") ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp") - ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp") + ocv_update(CPU_NEON_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_fp16.cpp") + ocv_update(CPU_NEON_BF16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_bf16.cpp") + ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_dotprod.cpp") if(NOT AARCH64) ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16") if(NOT MSVC) @@ -343,12 +345,17 @@ elseif(ARM OR AARCH64) endif() ocv_update(CPU_FP16_IMPLIES "NEON") else() - ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16") ocv_update(CPU_NEON_FLAGS_ON "") ocv_update(CPU_FP16_IMPLIES "NEON") ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod") ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON") + ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16") + ocv_update(CPU_NEON_FP16_IMPLIES "NEON") + ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+fp16+bf16") + ocv_update(CPU_NEON_BF16_IMPLIES "NEON") set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}") + set(CPU_DISPATCH "NEON_FP16;NEON_BF16;NEON_DOTPROD" CACHE STRING "${HELP_CPU_DISPATCH}") endif() elseif(MIPS) ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp") diff --git a/cmake/checks/cpu_neon_bf16.cpp b/cmake/checks/cpu_neon_bf16.cpp new file mode 100644 index 0000000000..c18d63c579 --- /dev/null +++ b/cmake/checks/cpu_neon_bf16.cpp @@ -0,0 +1,46 @@ +#if defined __GNUC__ && (defined __arm__ || defined __aarch64__) +#include +#include "arm_neon.h" + +/*#if defined __clang__ +#pragma clang attribute push (__attribute__((target("bf16"))), apply_to=function) +#elif defined GCC +#pragma GCC push_options +#pragma GCC target("armv8.2-a", "bf16") +#endif*/ +bfloat16x8_t vld1q_as_bf16(const float* src) +{ + float32x4_t s0 = vld1q_f32(src), s1 = vld1q_f32(src + 4); + return vcombine_bf16(vcvt_bf16_f32(s0), vcvt_bf16_f32(s1)); +} + +void vprintreg(const char* name, const float32x4_t& r) +{ + float data[4]; + vst1q_f32(data, r); + printf("%s: (%.2f, %.2f, %.2f, %.2f)\n", + name, data[0], data[1], data[2], data[3]); +} + +void test() +{ + const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }; + const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f }; + bfloat16x8_t s1 = vld1q_as_bf16(src1), s2 = vld1q_as_bf16(src2); + float32x4_t d = vbfdotq_f32(vdupq_n_f32(0.f), s1, s2); + vprintreg("(s1[0]*s2[0] + s1[1]*s2[1], ... s1[6]*s2[6] + s1[7]*s2[7])", d); +} +/*#if defined __clang__ +#pragma clang attribute pop +#elif defined GCC +#pragma GCC pop_options +#endif*/ +#else +#error "BF16 is not supported" +#endif + +int main() +{ + test(); + return 0; +} diff --git a/cmake/checks/cpu_dotprod.cpp b/cmake/checks/cpu_neon_dotprod.cpp similarity index 100% rename from cmake/checks/cpu_dotprod.cpp rename to cmake/checks/cpu_neon_dotprod.cpp diff --git a/cmake/checks/cpu_neon_fp16.cpp b/cmake/checks/cpu_neon_fp16.cpp new file mode 100644 index 0000000000..20fbab25d5 --- /dev/null +++ b/cmake/checks/cpu_neon_fp16.cpp @@ -0,0 +1,46 @@ +#include + +#if defined __GNUC__ && (defined __arm__ || defined __aarch64__) +#include "arm_neon.h" + +float16x8_t vld1q_as_f16(const float* src) +{ + float32x4_t s0 = vld1q_f32(src), s1 = vld1q_f32(src + 4); + return vcombine_f16(vcvt_f16_f32(s0), vcvt_f16_f32(s1)); +} + +void vprintreg(const char* name, const float16x8_t& r) +{ + float data[8]; + vst1q_f32(data, vcvt_f32_f16(vget_low_f16(r))); + vst1q_f32(data + 4, vcvt_f32_f16(vget_high_f16(r))); + printf("%s: (%.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f)\n", + name, data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); +} + +void test() +{ + const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }; + const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f }; + float16x8_t s1 = vld1q_as_f16(src1), s2 = vld1q_as_f16(src2); + float16x8_t d = vsubq_f16(s1, s1); + d = vfmaq_laneq_f16(d, s1, s2, 0); + d = vfmaq_laneq_f16(d, s1, s2, 1); + d = vfmaq_laneq_f16(d, s1, s2, 2); + d = vfmaq_laneq_f16(d, s1, s2, 3); + d = vfmaq_laneq_f16(d, s1, s2, 4); + d = vfmaq_laneq_f16(d, s1, s2, 5); + d = vfmaq_laneq_f16(d, s1, s2, 6); + d = vfmaq_laneq_f16(d, s1, s2, 7); + vprintreg("s1*s2[0]+s1*s2[1] + ... + s1*s2[7]", d); +} +#else +#error "FP16 is not supported" +#endif + +int main() +{ + test(); + return 0; +} diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 8307ca7d1c..9793c0ba68 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -269,6 +269,8 @@ namespace cv { #define CV_CPU_NEON 100 #define CV_CPU_NEON_DOTPROD 101 +#define CV_CPU_NEON_FP16 102 +#define CV_CPU_NEON_BF16 103 #define CV_CPU_MSA 150 @@ -328,6 +330,8 @@ enum CpuFeatures { CPU_NEON = 100, CPU_NEON_DOTPROD = 101, + CPU_NEON_FP16 = 102, + CPU_NEON_BF16 = 103, CPU_MSA = 150, diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 44e1e04a03..8b9f47dc2d 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -408,6 +408,8 @@ struct HWFeatures g_hwFeatureNames[CPU_NEON] = "NEON"; g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD"; + g_hwFeatureNames[CPU_NEON_FP16] = "NEON_FP16"; + g_hwFeatureNames[CPU_NEON_BF16] = "NEON_BF16"; g_hwFeatureNames[CPU_VSX] = "VSX"; g_hwFeatureNames[CPU_VSX3] = "VSX3"; @@ -566,10 +568,15 @@ struct HWFeatures while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) { + // see https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/uapi/asm/hwcap.h if (auxv.a_type == AT_HWCAP) { - have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; - break; + have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP + have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP + } + else if (auxv.a_type == AT_HWCAP2) + { + have[CV_CPU_NEON_BF16] = (auxv.a_un.a_val & (1 << 14)) != 0; // HWCAP2_BF16 } } @@ -623,16 +630,23 @@ struct HWFeatures have[CV_CPU_NEON] = true; #endif #if (defined __ARM_FP && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__)) - have[CV_CPU_FP16] = true; - #endif - #if (defined __ARM_FEATURE_DOTPROD) - int has_feat_dotprod = 0; - size_t has_feat_dotprod_size = sizeof(has_feat_dotprod); - sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0); - if (has_feat_dotprod) { - have[CV_CPU_NEON_DOTPROD] = true; - } + have[CV_CPU_FP16] = have[CV_CPU_NEON_FP16] = true; #endif + // system.cpp may be compiled w/o special -march=armv8...+dotprod, -march=armv8...+bf16 etc., + // so we check for the features in any case, no mater what are the compile flags. + // We check the real hardware capabilities here. + int has_feat_dotprod = 0; + size_t has_feat_dotprod_size = sizeof(has_feat_dotprod); + sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0); + if (has_feat_dotprod) { + have[CV_CPU_NEON_DOTPROD] = true; + } + int has_feat_bf16 = 0; + size_t has_feat_bf16_size = sizeof(has_feat_bf16); + sysctlbyname("hw.optional.arm.FEAT_BF16", &has_feat_bf16, &has_feat_bf16_size, NULL, 0); + if (has_feat_bf16) { + have[CV_CPU_NEON_BF16] = true; + } #elif (defined __clang__) #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) have[CV_CPU_NEON] = true;