@@ -591,28 +591,26 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
 {
-    return v_int16x8(vcombine_s16(
-                     vshrn_n_s32(vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)), 16),
-                     vshrn_n_s32(
 #if CV_NEON_AARCH64
-                     vmull_high_s16(a.val, b.val)
+    int32x4_t c = vmull_high_s16(a.val, b.val);
 #else // #if CV_NEON_AARCH64
-                     vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))
+    int32x4_t c = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
 #endif // #if CV_NEON_AARCH64
-                     , 16)
+    return v_int16x8(vcombine_s16(
+                     vshrn_n_s32(vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)), 16),
+                     vshrn_n_s32(c, 16)
     ));
 }
 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
 {
-    return v_uint16x8(vcombine_u16(
-                     vshrn_n_u32(vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)), 16),
-                     vshrn_n_u32(
 #if CV_NEON_AARCH64
-                     vmull_high_u16(a.val, b.val)
+    uint32x4_t c = vmull_high_u16(a.val, b.val);
 #else // #if CV_NEON_AARCH64
-                     vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
+    uint32x4_t c = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
 #endif // #if CV_NEON_AARCH64
-                     , 16)
+    return v_uint16x8(vcombine_u16(
+                     vshrn_n_u32(vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)), 16),
+                     vshrn_n_u32(c, 16)
     ));
 }
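This hunk hoists the #if CV_NEON_AARCH64 selection out of the vshrn_n_s32/vshrn_n_u32 argument list into a named temporary c, presumably because some toolchains implement these NEON intrinsics as macros, and a preprocessor conditional inside a macro's argument list is undefined behavior (and rejected by some compilers); the result is unchanged. Per lane, v_mul_hi returns the high 16 bits of the full 32-bit product: vmull_* widens to 32 bits and vshrn_n_*(x, 16) shifts right by 16 and narrows back. A scalar sketch of that semantics, for cross-checking only (the helper names are illustrative, not part of the patch, and the signed variant assumes the usual arithmetic right shift on negative values, matching vshrn_n_s32):

    #include <cstdint>

    // High 16 bits of the widened 16x16 -> 32-bit product, one lane.
    static inline int16_t mul_hi_scalar(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
    }

    static inline uint16_t mul_hi_scalar(uint16_t a, uint16_t b)
    {
        return (uint16_t)(((uint32_t)a * (uint32_t)b) >> 16);
    }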
@@ -1937,10 +1935,14 @@ inline v_int32x4 v_round(const v_float32x4& a)
 {
     float32x4_t a_ = a.val;
     int32x4_t result;
+#if defined _MSC_VER
+    result = vcvtnq_s32_f32(a_);
+#else
     __asm__ ("fcvtns %0.4s, %1.4s"
              : "=w"(result)
              : "w"(a_)
              : /* No clobbers */);
+#endif
     return v_int32x4(result);
 }
 #else
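The v_round hunk keeps the GCC/Clang inline-assembly fcvtns path but routes MSVC, which does not support GCC-style extended __asm__ statements, through the equivalent vcvtnq_s32_f32 intrinsic; both convert float to signed int rounding to nearest with ties to even. A minimal standalone sketch of that tie-breaking behaviour (assumes an AArch64 toolchain with <arm_neon.h>; the test values and main() are illustrative, not part of the patch):

    #include <arm_neon.h>
    #include <cstdio>

    int main()
    {
        float32x4_t v = {0.5f, 1.5f, 2.5f, -0.5f};
        int32x4_t r = vcvtnq_s32_f32(v);  // ties go to even: 0, 2, 2, 0
        int32_t out[4];
        vst1q_s32(out, r);
        std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        return 0;
    }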