diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 90ad7d783c..68867a25d0 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -331,7 +331,9 @@ if(X86 OR X86_64) elseif(ARM OR AARCH64) ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp") ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp") - ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp") + ocv_update(CPU_NEON_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_fp16.cpp") + ocv_update(CPU_NEON_BF16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_bf16.cpp") + ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_dotprod.cpp") if(NOT AARCH64) ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16") if(NOT MSVC) @@ -343,12 +345,17 @@ elseif(ARM OR AARCH64) endif() ocv_update(CPU_FP16_IMPLIES "NEON") else() - ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16") ocv_update(CPU_NEON_FLAGS_ON "") ocv_update(CPU_FP16_IMPLIES "NEON") ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod") ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON") + ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16") + ocv_update(CPU_NEON_FP16_IMPLIES "NEON") + ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+fp16+bf16") + ocv_update(CPU_NEON_BF16_IMPLIES "NEON") set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}") + set(CPU_DISPATCH "NEON_FP16;NEON_BF16;NEON_DOTPROD" CACHE STRING "${HELP_CPU_DISPATCH}") endif() elseif(MIPS) ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp") diff --git a/cmake/checks/cpu_neon_bf16.cpp b/cmake/checks/cpu_neon_bf16.cpp new file mode 100644 index 0000000000..c18d63c579 --- /dev/null +++ b/cmake/checks/cpu_neon_bf16.cpp @@ -0,0 +1,46 @@ +#if defined __GNUC__ && (defined __arm__ || defined __aarch64__) +#include +#include "arm_neon.h" + +/*#if defined __clang__ +#pragma clang attribute push (__attribute__((target("bf16"))), apply_to=function) +#elif defined GCC +#pragma GCC push_options +#pragma GCC target("armv8.2-a", "bf16") +#endif*/ +bfloat16x8_t vld1q_as_bf16(const float* src) +{ + float32x4_t s0 = vld1q_f32(src), s1 = vld1q_f32(src + 4); + return vcombine_bf16(vcvt_bf16_f32(s0), vcvt_bf16_f32(s1)); +} + +void vprintreg(const char* name, const float32x4_t& r) +{ + float data[4]; + vst1q_f32(data, r); + printf("%s: (%.2f, %.2f, %.2f, %.2f)\n", + name, data[0], data[1], data[2], data[3]); +} + +void test() +{ + const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }; + const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f }; + bfloat16x8_t s1 = vld1q_as_bf16(src1), s2 = vld1q_as_bf16(src2); + float32x4_t d = vbfdotq_f32(vdupq_n_f32(0.f), s1, s2); + vprintreg("(s1[0]*s2[0] + s1[1]*s2[1], ... s1[6]*s2[6] + s1[7]*s2[7])", d); +} +/*#if defined __clang__ +#pragma clang attribute pop +#elif defined GCC +#pragma GCC pop_options +#endif*/ +#else +#error "BF16 is not supported" +#endif + +int main() +{ + test(); + return 0; +} diff --git a/cmake/checks/cpu_dotprod.cpp b/cmake/checks/cpu_neon_dotprod.cpp similarity index 100% rename from cmake/checks/cpu_dotprod.cpp rename to cmake/checks/cpu_neon_dotprod.cpp diff --git a/cmake/checks/cpu_neon_fp16.cpp b/cmake/checks/cpu_neon_fp16.cpp new file mode 100644 index 0000000000..20fbab25d5 --- /dev/null +++ b/cmake/checks/cpu_neon_fp16.cpp @@ -0,0 +1,46 @@ +#include + +#if defined __GNUC__ && (defined __arm__ || defined __aarch64__) +#include "arm_neon.h" + +float16x8_t vld1q_as_f16(const float* src) +{ + float32x4_t s0 = vld1q_f32(src), s1 = vld1q_f32(src + 4); + return vcombine_f16(vcvt_f16_f32(s0), vcvt_f16_f32(s1)); +} + +void vprintreg(const char* name, const float16x8_t& r) +{ + float data[8]; + vst1q_f32(data, vcvt_f32_f16(vget_low_f16(r))); + vst1q_f32(data + 4, vcvt_f32_f16(vget_high_f16(r))); + printf("%s: (%.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f)\n", + name, data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); +} + +void test() +{ + const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }; + const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f }; + float16x8_t s1 = vld1q_as_f16(src1), s2 = vld1q_as_f16(src2); + float16x8_t d = vsubq_f16(s1, s1); + d = vfmaq_laneq_f16(d, s1, s2, 0); + d = vfmaq_laneq_f16(d, s1, s2, 1); + d = vfmaq_laneq_f16(d, s1, s2, 2); + d = vfmaq_laneq_f16(d, s1, s2, 3); + d = vfmaq_laneq_f16(d, s1, s2, 4); + d = vfmaq_laneq_f16(d, s1, s2, 5); + d = vfmaq_laneq_f16(d, s1, s2, 6); + d = vfmaq_laneq_f16(d, s1, s2, 7); + vprintreg("s1*s2[0]+s1*s2[1] + ... + s1*s2[7]", d); +} +#else +#error "FP16 is not supported" +#endif + +int main() +{ + test(); + return 0; +} diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 8307ca7d1c..9793c0ba68 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -269,6 +269,8 @@ namespace cv { #define CV_CPU_NEON 100 #define CV_CPU_NEON_DOTPROD 101 +#define CV_CPU_NEON_FP16 102 +#define CV_CPU_NEON_BF16 103 #define CV_CPU_MSA 150 @@ -328,6 +330,8 @@ enum CpuFeatures { CPU_NEON = 100, CPU_NEON_DOTPROD = 101, + CPU_NEON_FP16 = 102, + CPU_NEON_BF16 = 103, CPU_MSA = 150, diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 44e1e04a03..8b9f47dc2d 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -408,6 +408,8 @@ struct HWFeatures g_hwFeatureNames[CPU_NEON] = "NEON"; g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD"; + g_hwFeatureNames[CPU_NEON_FP16] = "NEON_FP16"; + g_hwFeatureNames[CPU_NEON_BF16] = "NEON_BF16"; g_hwFeatureNames[CPU_VSX] = "VSX"; g_hwFeatureNames[CPU_VSX3] = "VSX3"; @@ -566,10 +568,15 @@ struct HWFeatures while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) { + // see https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/uapi/asm/hwcap.h if (auxv.a_type == AT_HWCAP) { - have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; - break; + have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP + have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP + } + else if (auxv.a_type == AT_HWCAP2) + { + have[CV_CPU_NEON_BF16] = (auxv.a_un.a_val & (1 << 14)) != 0; // HWCAP2_BF16 } } @@ -623,16 +630,23 @@ struct HWFeatures have[CV_CPU_NEON] = true; #endif #if (defined __ARM_FP && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__)) - have[CV_CPU_FP16] = true; - #endif - #if (defined __ARM_FEATURE_DOTPROD) - int has_feat_dotprod = 0; - size_t has_feat_dotprod_size = sizeof(has_feat_dotprod); - sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0); - if (has_feat_dotprod) { - have[CV_CPU_NEON_DOTPROD] = true; - } + have[CV_CPU_FP16] = have[CV_CPU_NEON_FP16] = true; #endif + // system.cpp may be compiled w/o special -march=armv8...+dotprod, -march=armv8...+bf16 etc., + // so we check for the features in any case, no mater what are the compile flags. + // We check the real hardware capabilities here. + int has_feat_dotprod = 0; + size_t has_feat_dotprod_size = sizeof(has_feat_dotprod); + sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0); + if (has_feat_dotprod) { + have[CV_CPU_NEON_DOTPROD] = true; + } + int has_feat_bf16 = 0; + size_t has_feat_bf16_size = sizeof(has_feat_bf16); + sysctlbyname("hw.optional.arm.FEAT_BF16", &has_feat_bf16, &has_feat_bf16_size, NULL, 0); + if (has_feat_bf16) { + have[CV_CPU_NEON_BF16] = true; + } #elif (defined __clang__) #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) have[CV_CPU_NEON] = true;