From 49f80cb3c42a91a8baa966b38664bdae4296db22 Mon Sep 17 00:00:00 2001 From: Yuantao Feng Date: Tue, 21 May 2024 21:58:16 +0800 Subject: [PATCH] Merge pull request #24804 from fengyuentau:fix_lapack_warnings core: try to solve warnings caused by Apple's new LAPACK interface #24804 Resolves https://github.com/opencv/opencv/issues/24660 Apple's BLAS documentation: https://developer.apple.com/documentation/accelerate/blas?language=objc The new interface is available on macOS >= 13.3 and iOS >= 16.4. Todo: - [x] Detect macOS version. - [x] ~Detect iOS versions (major and minor version).~ Accelerate New LAPACK is not called on iOS. - [x] Fix the calls to `cblas_cgemm` and `cblas_zgemm`. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under the Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable The patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake --- cmake/OpenCVFindLAPACK.cmake | 15 ++++ modules/core/src/hal_internal.cpp | 143 ++++++++++++++++++++++-------- 2 files changed, 122 insertions(+), 36 deletions(-) diff --git a/cmake/OpenCVFindLAPACK.cmake b/cmake/OpenCVFindLAPACK.cmake index 9b1b60f19e..d559ab88b9 100644 --- a/cmake/OpenCVFindLAPACK.cmake +++ b/cmake/OpenCVFindLAPACK.cmake @@ -106,11 +106,26 @@ macro(ocv_lapack_check) list(APPEND __link_directories ${LAPACK_LINK_LIBRARIES}) endif() + set(LAPACK_TRY_COMPILE_DEF "") + if(LAPACK_IMPL STREQUAL "LAPACK/Apple" AND NOT IOS) # https://github.com/opencv/opencv/issues/24660 + # Get macOS version + execute_process(COMMAND sw_vers -productVersion + OUTPUT_VARIABLE MACOS_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + # Enable Accelerate New LAPACK if macOS >= 13.3 + if (MACOS_VERSION VERSION_GREATER "13.3" OR MACOS_VERSION VERSION_EQUAL "13.3") + set(LAPACK_TRY_COMPILE_DEF "-DACCELERATE_NEW_LAPACK") + add_compile_definitions(ACCELERATE_NEW_LAPACK) + add_compile_definitions(ACCELERATE_LAPACK_ILP64) + endif() + endif() + try_compile(__VALID_LAPACK "${OpenCV_BINARY_DIR}" "${OpenCV_SOURCE_DIR}/cmake/checks/lapack_check.cpp" CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${LAPACK_INCLUDE_DIR}\;${CMAKE_BINARY_DIR}" "-DLINK_DIRECTORIES:STRING=${__link_directories}" + COMPILE_DEFINITIONS ${LAPACK_TRY_COMPILE_DEF} LINK_LIBRARIES ${LAPACK_LIBRARIES} OUTPUT_VARIABLE TRY_OUT ) diff --git a/modules/core/src/hal_internal.cpp b/modules/core/src/hal_internal.cpp index 28227688c0..377c688015 100644 --- a/modules/core/src/hal_internal.cpp +++ b/modules/core/src/hal_internal.cpp @@ -111,8 +111,18 @@ set_value(fptype *dst, size_t dst_ld, fptype value, size_t m, size_t n) template static inline int lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* info) { - int lda = (int)(a_step / sizeof(fptype)), sign = 0; - int* piv = new int[m]; +#if defined 
(ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + cv::AutoBuffer piv_buff(m); + long lda = (long)(a_step / sizeof(fptype)); + long _m = static_cast(m), _n = static_cast(n); + long _info[1]; +#else + cv::AutoBuffer piv_buff(m); + int lda = (int)(a_step / sizeof(fptype)); + int _m = m, _n = n; + int* _info = info; +#endif + auto piv = piv_buff.data(); transpose_square_inplace(a, lda, m); @@ -121,9 +131,9 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* if(n == 1 && b_step == sizeof(fptype)) { if(typeid(fptype) == typeid(float)) - sgesv_(&m, &n, (float*)a, &lda, piv, (float*)b, &m, info); + sgesv_(&_m, &_n, (float*)a, &lda, piv, (float*)b, &_m, _info); else if(typeid(fptype) == typeid(double)) - dgesv_(&m, &n, (double*)a, &lda, piv, (double*)b, &m, info); + dgesv_(&_m, &_n, (double*)a, &lda, piv, (double*)b, &_m, _info); } else { @@ -133,9 +143,9 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* transpose(b, ldb, tmpB, m, m, n); if(typeid(fptype) == typeid(float)) - sgesv_(&m, &n, (float*)a, &lda, piv, (float*)tmpB, &m, info); + sgesv_(&_m, &_n, (float*)a, &lda, piv, (float*)tmpB, &_m, _info); else if(typeid(fptype) == typeid(double)) - dgesv_(&m, &n, (double*)a, &lda, piv, (double*)tmpB, &m, info); + dgesv_(&_m, &_n, (double*)a, &lda, piv, (double*)tmpB, &_m, _info); transpose(tmpB, m, b, ldb, n, m); delete[] tmpB; @@ -144,11 +154,16 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* else { if(typeid(fptype) == typeid(float)) - sgetrf_(&m, &m, (float*)a, &lda, piv, info); + sgetrf_(&_m, &_m, (float*)a, &lda, piv, _info); else if(typeid(fptype) == typeid(double)) - dgetrf_(&m, &m, (double*)a, &lda, piv, info); + dgetrf_(&_m, &_m, (double*)a, &lda, piv, _info); } +#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + *info = static_cast(_info[0]); +#endif + + int sign = 0; if(*info == 0) { for(int i = 0; i < m; i++) @@ -158,15 +173,21 
@@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* else *info = 0; //in opencv LU function zero means error - delete[] piv; return CV_HAL_ERROR_OK; } template static inline int lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, bool* info) { +#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + long _m = static_cast(m), _n = static_cast(n); + long lapackStatus = 0; + long lda = (long)(a_step / sizeof(fptype)); +#else + int _m = m, _n = n; int lapackStatus = 0; int lda = (int)(a_step / sizeof(fptype)); +#endif char L[] = {'L', '\0'}; if(b) @@ -174,9 +195,9 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n if(n == 1 && b_step == sizeof(fptype)) { if(typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sposv)(L, &m, &n, (float*)a, &lda, (float*)b, &m, &lapackStatus); + OCV_LAPACK_FUNC(sposv)(L, &_m, &_n, (float*)a, &lda, (float*)b, &_m, &lapackStatus); else if(typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dposv)(L, &m, &n, (double*)a, &lda, (double*)b, &m, &lapackStatus); + OCV_LAPACK_FUNC(dposv)(L, &_m, &_n, (double*)a, &lda, (double*)b, &_m, &lapackStatus); } else { @@ -185,9 +206,9 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n transpose(b, ldb, tmpB, m, m, n); if(typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sposv)(L, &m, &n, (float*)a, &lda, (float*)tmpB, &m, &lapackStatus); + OCV_LAPACK_FUNC(sposv)(L, &_m, &_n, (float*)a, &lda, (float*)tmpB, &_m, &lapackStatus); else if(typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dposv)(L, &m, &n, (double*)a, &lda, (double*)tmpB, &m, &lapackStatus); + OCV_LAPACK_FUNC(dposv)(L, &_m, &_n, (double*)a, &lda, (double*)tmpB, &_m, &lapackStatus); transpose(tmpB, m, b, ldb, n, m); delete[] tmpB; @@ -196,9 +217,9 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n else { if(typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(spotrf)(L, 
&m, (float*)a, &lda, &lapackStatus); + OCV_LAPACK_FUNC(spotrf)(L, &_m, (float*)a, &lda, &lapackStatus); else if(typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dpotrf)(L, &m, (double*)a, &lda, &lapackStatus); + OCV_LAPACK_FUNC(dpotrf)(L, &_m, (double*)a, &lda, &lapackStatus); } if(lapackStatus == 0) *info = true; @@ -210,11 +231,24 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n template static inline int lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype* vt, size_t v_step, int m, int n, int flags, int* info) { +#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + long _m = static_cast(m), _n = static_cast(n); + long _info[1]; + long lda = (long)(a_step / sizeof(fptype)); + long ldv = (long)(v_step / sizeof(fptype)); + long ldu = (long)(u_step / sizeof(fptype)); + long lwork = -1; + cv::AutoBuffer iworkBuf_(8 * std::min(m, n)); +#else + int _m = m, _n = n; + int* _info = info; int lda = (int)(a_step / sizeof(fptype)); int ldv = (int)(v_step / sizeof(fptype)); int ldu = (int)(u_step / sizeof(fptype)); int lwork = -1; - int* iworkBuf = new int[8*std::min(m, n)]; + cv::AutoBuffer iworkBuf_(8 * std::min(m, n)); +#endif + auto iworkBuf = iworkBuf_.data(); fptype work1 = 0; //A already transposed and m>=n @@ -238,9 +272,9 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype } if(typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sgesdd)(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)&work1, &lwork, iworkBuf, info); + OCV_LAPACK_FUNC(sgesdd)(mode, &_m, &_n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)&work1, &lwork, iworkBuf, _info); else if(typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dgesdd)(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)&work1, &lwork, iworkBuf, info); + OCV_LAPACK_FUNC(dgesdd)(mode, &_m, &_n, (double*)a, &lda, 
(double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)&work1, &lwork, iworkBuf, _info); lwork = (int)round(work1); //optimal buffer size fptype* buffer = new fptype[lwork + 1]; @@ -251,9 +285,13 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype CV_ANNOTATE_MEMORY_IS_INITIALIZED(buffer, sizeof(fptype) * (lwork + 1)); if(typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sgesdd)(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)buffer, &lwork, iworkBuf, info); + OCV_LAPACK_FUNC(sgesdd)(mode, &_m, &_n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)buffer, &lwork, iworkBuf, _info); else if(typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dgesdd)(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)buffer, &lwork, iworkBuf, info); + OCV_LAPACK_FUNC(dgesdd)(mode, &_m, &_n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)buffer, &lwork, iworkBuf, _info); + +#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + *info = static_cast(_info[0]); +#endif // Make sure MSAN sees the memory as having been written. // MSAN does not think it has been written because a different language was called. 
@@ -276,7 +314,6 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype delete[] u; } - delete[] iworkBuf; delete[] buffer; return CV_HAL_ERROR_OK; } @@ -284,14 +321,27 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype template static inline int lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_step, fptype* dst, int* info) { +#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + long _m = static_cast(m), _n = static_cast(n), _k = static_cast(k); + long _info[1]; + long lda = (long)(a_step / sizeof(fptype)); + long lwork = -1; + long ldtmpA; +#else + int _m = m, _n = n, _k = k; + int* _info = info; int lda = (int)(a_step / sizeof(fptype)); + int lwork = -1; + int ldtmpA; +#endif + char mode[] = { 'N', '\0' }; if(m < n) return CV_HAL_ERROR_NOT_IMPLEMENTED; std::vector tmpAMemHolder; fptype* tmpA; - int ldtmpA; + if (m == n) { transpose_square_inplace(a, lda, m); @@ -306,7 +356,6 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste transpose(a, lda, tmpA, m, m, n); } - int lwork = -1; fptype work1 = 0.; if (b) @@ -314,18 +363,18 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste if (k == 1 && b_step == sizeof(fptype)) { if (typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)&work1, &lwork, info); + OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)b, &_m, (float*)&work1, &lwork, _info); else if (typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)&work1, &lwork, info); + OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)b, &_m, (double*)&work1, &lwork, _info); lwork = cvRound(work1); //optimal buffer size std::vector workBufMemHolder(lwork + 1); fptype* buffer = &workBufMemHolder.front(); if 
(typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)buffer, &lwork, info); + OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)b, &_m, (float*)buffer, &lwork, _info); else if (typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)buffer, &lwork, info); + OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)b, &_m, (double*)buffer, &lwork, _info); } else { @@ -335,18 +384,18 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste transpose(b, ldb, tmpB, m, m, k); if (typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)&work1, &lwork, info); + OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)tmpB, &_m, (float*)&work1, &lwork, _info); else if (typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)&work1, &lwork, info); + OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)tmpB, &_m, (double*)&work1, &lwork, _info); lwork = cvRound(work1); //optimal buffer size std::vector workBufMemHolder(lwork + 1); fptype* buffer = &workBufMemHolder.front(); if (typeid(fptype) == typeid(float)) - OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)buffer, &lwork, info); + OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)tmpB, &_m, (float*)buffer, &lwork, _info); else if (typeid(fptype) == typeid(double)) - OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)buffer, &lwork, info); + OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)tmpB, &_m, (double*)buffer, &lwork, _info); transpose(tmpB, m, b, ldb, k, m); } @@ -354,18 +403,18 @@ 
lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste else { if (typeid(fptype) == typeid(float)) - sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)&work1, &lwork, info); + sgeqrf_(&_m, &_n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)&work1, &lwork, _info); else if (typeid(fptype) == typeid(double)) - dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)&work1, &lwork, info); + dgeqrf_(&_m, &_n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)&work1, &lwork, _info); lwork = cvRound(work1); //optimal buffer size std::vector workBufMemHolder(lwork + 1); fptype* buffer = &workBufMemHolder.front(); if (typeid(fptype) == typeid(float)) - sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)buffer, &lwork, info); + sgeqrf_(&_m, &_n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)buffer, &lwork, _info); else if (typeid(fptype) == typeid(double)) - dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)buffer, &lwork, info); + dgeqrf_(&_m, &_n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)buffer, &lwork, _info); } CV_ANNOTATE_MEMORY_IS_INITIALIZED(info, sizeof(int)); @@ -374,6 +423,10 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste else transpose(tmpA, m, a, lda, n, m); +#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + *info = static_cast(_info[0]); +#endif + if (*info != 0) *info = 0; else @@ -458,7 +511,6 @@ lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src return CV_HAL_ERROR_OK; } - template static inline int lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags) @@ -529,10 +581,29 @@ lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t s else if(src3_step == 0 && beta != 0.0) set_value((std::complex*)dst, lddst, 
std::complex(0.0, 0.0), d_m, d_n); + // FIXME: this is a workaround. Support ILP64 in HAL API. +#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64) + int M = a_m, N = d_n, K = a_n; + if(typeid(fptype) == typeid(float)) { + auto src1_cast = (std::complex*)(src1); + auto src2_cast = (std::complex*)(src2); + auto dst_cast = (std::complex*)(dst); + long lda = ldsrc1, ldb = ldsrc2, ldc = lddst; + cblas_cgemm(CblasRowMajor, transA, transB, M, N, K, (std::complex*)&cAlpha, src1_cast, lda, src2_cast, ldb, (std::complex*)&cBeta, dst_cast, ldc); + } + else if(typeid(fptype) == typeid(double)) { + auto src1_cast = (std::complex*)(src1); + auto src2_cast = (std::complex*)(src2); + auto dst_cast = (std::complex*)(dst); + long lda = ldsrc1, ldb = ldsrc2, ldc = lddst; + cblas_zgemm(CblasRowMajor, transA, transB, M, N, K, (std::complex*)&cAlpha, src1_cast, lda, src2_cast, ldb, (std::complex*)&cBeta, dst_cast, ldc); + } +#else if(typeid(fptype) == typeid(float)) cblas_cgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (float*)reinterpret_cast(cAlpha), (float*)src1, ldsrc1, (float*)src2, ldsrc2, (float*)reinterpret_cast(cBeta), (float*)dst, lddst); else if(typeid(fptype) == typeid(double)) cblas_zgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (double*)reinterpret_cast(cAlpha), (double*)src1, ldsrc1, (double*)src2, ldsrc2, (double*)reinterpret_cast(cBeta), (double*)dst, lddst); +#endif return CV_HAL_ERROR_OK; }