Merge pull request #24205 from PeterJohnson:fix-msvc-arm64

ht_dec.c: Improve MSVC arm64 popcount performance #24205

Use NEON instructions for ARM64 (implementation based on https://github.com/microsoft/STL/pull/2127, which is Apache-licensed).

Godbolt output here: https://godbolt.org/z/q7GPTqT14
Related patch to openjpeg: https://github.com/uclouvain/openjpeg/pull/1479

### Pull Request Readiness Checklist

- [x] I agree to contribute to the project under the Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch
pull/24681/head
Peter Johnson 11 months ago committed by GitHub
parent 098efb6d3d
commit 72a987ef5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 7
      3rdparty/openjpeg/openjp2/ht_dec.c

@ -55,6 +55,10 @@
#define OPJ_COMPILER_GNUC #define OPJ_COMPILER_GNUC
#endif #endif
#if defined(OPJ_COMPILER_MSVC) && defined(_M_ARM64)
#include <arm64_neon.h>
#endif
//************************************************************************/ //************************************************************************/
/** @brief Displays the error message for disabling the decoding of SPP and /** @brief Displays the error message for disabling the decoding of SPP and
* MRP passes * MRP passes
@ -71,6 +75,9 @@ OPJ_UINT32 population_count(OPJ_UINT32 val)
{ {
#if defined(OPJ_COMPILER_MSVC) && (defined(_M_IX86) || defined(_M_AMD64)) #if defined(OPJ_COMPILER_MSVC) && (defined(_M_IX86) || defined(_M_AMD64))
return (OPJ_UINT32)__popcnt(val); return (OPJ_UINT32)__popcnt(val);
#elif defined(OPJ_COMPILER_MSVC) && defined(_M_ARM64)
const __n64 temp = neon_cnt(__uint64ToN64_v(val));
return neon_addv8(temp).n8_i8[0];
#elif (defined OPJ_COMPILER_GNUC) #elif (defined OPJ_COMPILER_GNUC)
return (OPJ_UINT32)__builtin_popcount(val); return (OPJ_UINT32)__builtin_popcount(val);
#else #else

Loading…
Cancel
Save