From a2bba1b9e6771b8b27906bf9e3fcea6f2cd3bd2c Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Wed, 15 Apr 2015 00:36:27 +0300 Subject: [PATCH 01/12] HAL extensions: added initial version of universal intrinsics (C implementation and SSE2, NEON - TBD). added empty files where some functionality from core and imgproc will be moved to --- modules/core/include/opencv2/core/base.hpp | 131 - modules/core/include/opencv2/core/cvdef.h | 181 -- modules/core/include/opencv2/core/private.hpp | 8 - modules/core/include/opencv2/core/types_c.h | 16 - modules/hal/include/opencv2/hal.hpp | 45 +- modules/hal/include/opencv2/hal/defs.h | 391 ++- modules/hal/include/opencv2/hal/intrin.hpp | 2254 +++++++++++++++++ modules/hal/src/arithm.cpp | 47 + modules/hal/src/color.cpp | 47 + modules/hal/src/filter.cpp | 47 + modules/hal/src/mathfuncs.cpp | 47 + modules/hal/src/matrix.cpp | 47 + modules/hal/src/precomp.hpp | 42 + modules/hal/src/resize.cpp | 47 + modules/hal/src/{norm.cpp => stat.cpp} | 0 modules/hal/src/warp.cpp | 47 + 16 files changed, 3055 insertions(+), 342 deletions(-) create mode 100644 modules/hal/include/opencv2/hal/intrin.hpp create mode 100644 modules/hal/src/arithm.cpp create mode 100644 modules/hal/src/color.cpp create mode 100644 modules/hal/src/filter.cpp create mode 100644 modules/hal/src/mathfuncs.cpp create mode 100644 modules/hal/src/matrix.cpp create mode 100644 modules/hal/src/resize.cpp rename modules/hal/src/{norm.cpp => stat.cpp} (100%) create mode 100644 modules/hal/src/warp.cpp diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 08a6642fba..83661a2fd6 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -400,136 +400,6 @@ configurations while CV_DbgAssert is only retained in the Debug configuration. # define CV_DbgAssert(expr) #endif - -/////////////// saturate_cast (used in image & signal processing) /////////////////// - -/** -Template function for accurate conversion from one primitive type to another. - -The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\() -and others. They perform an efficient and accurate conversion from one primitive type to another -(see the introduction chapter). saturate in the name means that when the input value v is out of the -range of the target type, the result is not formed just by taking low bits of the input, but instead -the value is clipped. For example: -@code - uchar a = saturate_cast(-100); // a = 0 (UCHAR_MIN) - short b = saturate_cast(33333.33333); // b = 32767 (SHRT_MAX) -@endcode -Such clipping is done when the target type is unsigned char , signed char , unsigned short or -signed short . For 32-bit integers, no clipping is done. - -When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit), -the floating-point value is first rounded to the nearest integer and then clipped if needed (when -the target type is 8- or 16-bit). - -This operation is used in the simplest or most complex image processing functions in OpenCV. - -@param v Function parameter. 
-@sa add, subtract, multiply, divide, Mat::convertTo -*/ -template static inline _Tp saturate_cast(uchar v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(schar v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(ushort v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(short v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(unsigned v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(int v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(float v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(double v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(int64 v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(uint64 v) { return _Tp(v); } - -//! @cond IGNORED - -template<> inline uchar saturate_cast(schar v) { return (uchar)std::max((int)v, 0); } -template<> inline uchar saturate_cast(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); } -template<> inline uchar saturate_cast(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } -template<> inline uchar saturate_cast(short v) { return saturate_cast((int)v); } -template<> inline uchar saturate_cast(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); } -template<> inline uchar saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline uchar saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline uchar saturate_cast(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } -template<> inline uchar saturate_cast(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); } - -template<> inline schar saturate_cast(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); } -template<> inline schar saturate_cast(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); } -template<> inline schar saturate_cast(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } -template<> inline schar saturate_cast(short v) { return saturate_cast((int)v); } -template<> inline schar saturate_cast(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); } -template<> inline schar saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline schar saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline schar saturate_cast(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } -template<> inline schar saturate_cast(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); } - -template<> inline ushort saturate_cast(schar v) { return (ushort)std::max((int)v, 0); } -template<> inline ushort saturate_cast(short v) { return (ushort)std::max((int)v, 0); } -template<> inline ushort saturate_cast(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } -template<> inline ushort saturate_cast(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); } -template<> inline ushort saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline ushort saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline ushort saturate_cast(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); } -template<> inline ushort saturate_cast(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); } - -template<> inline short saturate_cast(ushort v) { return (short)std::min((int)v, SHRT_MAX); } -template<> inline short saturate_cast(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } -template<> inline short saturate_cast(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); } -template<> inline short saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline short saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline short saturate_cast(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } -template<> inline short saturate_cast(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); } - -template<> inline int saturate_cast(float v) { return cvRound(v); } -template<> inline int saturate_cast(double v) { return cvRound(v); } - -// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc. -template<> inline unsigned saturate_cast(float v) { return cvRound(v); } -template<> inline unsigned saturate_cast(double v) { return cvRound(v); } - -//! @endcond - -//////////////////////////////// low-level functions //////////////////////////////// - -CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n); -CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n); -CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n); -CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n); - -CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n); -CV_EXPORTS float normL1_(const float* a, const float* b, int n); -CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n); - -CV_EXPORTS void exp(const float* src, float* dst, int n); -CV_EXPORTS void log(const float* src, float* dst, int n); - -CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); -CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n); - -/** @brief Computes the cube root of an argument. - -The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly. -NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for -single-precision data. -@param val A function argument. - */ -CV_EXPORTS_W float cubeRoot(float val); - -/** @brief Calculates the angle of a 2D vector in degrees. - -The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured -in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees. -@param x x-coordinate of the vector. -@param y y-coordinate of the vector. 
- */ -CV_EXPORTS_W float fastAtan2(float y, float x); - /* * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor * bit count of A exclusive XOR'ed with B @@ -549,7 +419,6 @@ typedef Hamming HammingLUT; /////////////////////////////////// inline norms //////////////////////////////////// - template static inline _AccTp normL2Sqr(const _Tp* a, int n) { diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index af0a271a41..1d933b5c30 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -70,16 +70,6 @@ # define CV_EXPORTS #endif -#ifndef CV_INLINE -# if defined __cplusplus -# define CV_INLINE static inline -# elif defined _MSC_VER -# define CV_INLINE __inline -# else -# define CV_INLINE static -# endif -#endif - #ifndef CV_EXTERN_C # ifdef __cplusplus # define CV_EXTERN_C extern "C" @@ -186,19 +176,6 @@ #define CV_ELEM_SIZE(type) \ (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3)) - -/****************************************************************************************\ -* fast math * -\****************************************************************************************/ - -#if defined __BORLANDC__ -# include -#elif defined __cplusplus -# include -#else -# include -#endif - #ifndef MIN # define MIN(a,b) ((a) > (b) ? (b) : (a)) #endif @@ -207,164 +184,6 @@ # define MAX(a,b) ((a) < (b) ? (b) : (a)) #endif -#ifdef HAVE_TEGRA_OPTIMIZATION -# include "tegra_round.hpp" -#endif - -//! @addtogroup core_utils -//! @{ - -#if CV_VFP -// 1. general scheme -#define ARM_ROUND(_value, _asm_string) \ - int res; \ - float temp; \ - asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \ - return res; -// 2. version for double -#ifdef __clang__ -#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") -#else -#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") -#endif -// 3. version for float -#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") -#endif // CV_VFP - -/** @brief Rounds floating-point number to the nearest integer - -@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the -result is not defined. - */ -CV_INLINE int cvRound( double value ) -{ -#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) - __m128d t = _mm_set_sd( value ); - return _mm_cvtsd_si32(t); -#elif defined _MSC_VER && defined _M_IX86 - int t; - __asm - { - fld value; - fistp t; - } - return t; -#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_DBL(value); -#elif defined CV_ICC || defined __GNUC__ -# if CV_VFP - ARM_ROUND_DBL(value) -# else - return (int)lrint(value); -# endif -#else - double intpart, fractpart; - fractpart = modf(value, &intpart); - if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0)) - return (int)(value + (value >= 0 ? 
0.5 : -0.5)); - else - return (int)intpart; -#endif -} - -#ifdef __cplusplus - -/** @overload */ -CV_INLINE int cvRound(float value) -{ -#if defined ANDROID && (defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_FLT(value); -#elif CV_VFP && !defined HAVE_TEGRA_OPTIMIZATION - ARM_ROUND_FLT(value) -#else - return cvRound((double)value); -#endif -} - -/** @overload */ -CV_INLINE int cvRound(int value) -{ - return value; -} - -#endif // __cplusplus - -/** @brief Rounds floating-point number to the nearest integer not larger than the original. - -The function computes an integer i such that: -\f[i \le \texttt{value} < i+1\f] -@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the -result is not defined. - */ -CV_INLINE int cvFloor( double value ) -{ -#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) - __m128d t = _mm_set_sd( value ); - int i = _mm_cvtsd_si32(t); - return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i))); -#elif defined __GNUC__ - int i = (int)value; - return i - (i > value); -#else - int i = cvRound(value); - float diff = (float)(value - i); - return i - (diff < 0); -#endif -} - -/** @brief Rounds floating-point number to the nearest integer not larger than the original. - -The function computes an integer i such that: -\f[i \le \texttt{value} < i+1\f] -@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the -result is not defined. -*/ -CV_INLINE int cvCeil( double value ) -{ -#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__) - __m128d t = _mm_set_sd( value ); - int i = _mm_cvtsd_si32(t); - return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t)); -#elif defined __GNUC__ - int i = (int)value; - return i + (i < value); -#else - int i = cvRound(value); - float diff = (float)(i - value); - return i + (diff < 0); -#endif -} - -/** @brief Determines if the argument is Not A Number. - -@param value The input floating-point value - -The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0 -otherwise. */ -CV_INLINE int cvIsNaN( double value ) -{ - union { uint64 u; double f; } ieee754; - ieee754.f = value; - return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + - ((unsigned)ieee754.u != 0) > 0x7ff00000; -} - -/** @brief Determines if the argument is Infinity. - -@param value The input floating-point value - -The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard) -and 0 otherwise. */ -CV_INLINE int cvIsInf( double value ) -{ - union { uint64 u; double f; } ieee754; - ieee754.f = value; - return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && - (unsigned)ieee754.u == 0; -} - -//! 
@} core_utils - /****************************************************************************************\ * exchange-add operation for atomic operations on reference counters * \****************************************************************************************/ diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index 58d78e5848..4f9f487778 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -136,14 +136,6 @@ namespace cv /* the alignment of all the allocated buffers */ #define CV_MALLOC_ALIGN 16 -#ifdef __GNUC__ -# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x))) -#elif defined _MSC_VER -# define CV_DECL_ALIGNED(x) __declspec(align(x)) -#else -# define CV_DECL_ALIGNED(x) -#endif - /* IEEE754 constants and macros */ #define CV_TOGGLE_FLT(x) ((x)^((int)(x) < 0 ? 0x7fffffff : 0)) #define CV_TOGGLE_DBL(x) ((x)^((int64)(x) < 0 ? CV_BIG_INT(0x7fffffffffffffff) : 0)) diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h index 16e613053d..cb39587a9a 100644 --- a/modules/core/include/opencv2/core/types_c.h +++ b/modules/core/include/opencv2/core/types_c.h @@ -113,22 +113,6 @@ bytes of the header. In C++ interface the role of CvArr is played by InputArray */ typedef void CvArr; -typedef union Cv32suf -{ - int i; - unsigned u; - float f; -} -Cv32suf; - -typedef union Cv64suf -{ - int64 i; - uint64 u; - double f; -} -Cv64suf; - typedef int CVStatus; /** @see cv::Error::Code */ diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index d0e1350803..7641c46cad 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -55,7 +55,7 @@ namespace cv { namespace hal { namespace Error { -enum Code +enum { Ok = 0, Unknown = -1 @@ -63,11 +63,46 @@ enum Code } -Error::Code normHamming(const uchar* a, int n, int & result); -Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result); +int normHamming(const uchar* a, int n); +int normHamming(const uchar* a, const uchar* b, int n); -Error::Code normHamming(const uchar* a, int n, int cellSize, int & result); -Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int & result); +int normHamming(const uchar* a, int n, int cellSize); +int normHamming(const uchar* a, const uchar* b, int n, int cellSize); + +//////////////////////////////// low-level functions //////////////////////////////// + +int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n); +int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n); +bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n); +bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n); + +int normL1_(const uchar* a, const uchar* b, int n); +float normL1_(const float* a, const float* b, int n); +float normL2Sqr_(const float* a, const float* b, int n); + +void exp(const float* src, float* dst, int n); +void log(const float* src, float* dst, int n); + +void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); +void magnitude(const float* x, const float* y, float* dst, int n); + +/** @brief Computes the cube root of an argument. + + The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly. + NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for + single-precision data. 
+ @param val A function argument. + */ +float cubeRoot(float val); + +/** @brief Calculates the angle of a 2D vector in degrees. + + The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured + in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees. + @param x x-coordinate of the vector. + @param y y-coordinate of the vector. + */ +float fastAtan2(float y, float x); }} //cv::hal diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index 96595961f0..6e1ff2a0ad 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -1,3 +1,4 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // @@ -48,6 +49,8 @@ # define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */ #endif +#include + #if defined __ICL # define CV_ICC __ICL #elif defined __ICC @@ -60,12 +63,30 @@ # define CV_ICC __INTEL_COMPILER #endif +#ifndef CV_INLINE +# if defined __cplusplus +# define CV_INLINE static inline +# elif defined _MSC_VER +# define CV_INLINE __inline +# else +# define CV_INLINE static +# endif +#endif + #if defined CV_ICC && !defined CV_ENABLE_UNROLLED # define CV_ENABLE_UNROLLED 0 #else # define CV_ENABLE_UNROLLED 1 #endif +#ifdef __GNUC__ +# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x))) +#elif defined _MSC_VER +# define CV_DECL_ALIGNED(x) __declspec(align(x)) +#else +# define CV_DECL_ALIGNED(x) +#endif + /* CPU features and intrinsics support */ #define CV_CPU_NONE 0 #define CV_CPU_MMX 1 @@ -99,7 +120,7 @@ // do not include SSE/AVX/NEON headers for NVCC compiler #ifndef __CUDACC__ -#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) # include # define CV_MMX 1 # define CV_SSE 1 @@ -281,4 +302,372 @@ typedef signed char schar; #define CV_2PI 6.283185307179586476925286766559 #define CV_LOG2 0.69314718055994530941723212145818 +typedef union Cv32suf +{ + int i; + unsigned u; + float f; +} +Cv32suf; + +typedef union Cv64suf +{ + int64 i; + uint64 u; + double f; +} +Cv64suf; + + +/****************************************************************************************\ +* fast math * +\****************************************************************************************/ + +#if defined __BORLANDC__ +# include +#elif defined __cplusplus +# include +#else +# include +#endif + +#ifdef HAVE_TEGRA_OPTIMIZATION +# include "tegra_round.hpp" +#endif + +//! @addtogroup core_utils +//! @{ + +#if CV_VFP + // 1. general scheme + #define ARM_ROUND(_value, _asm_string) \ + int res; \ + float temp; \ + asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \ + return res + // 2. version for double + #ifdef __clang__ + #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") + #else + #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") + #endif + // 3. version for float + #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") +#endif // CV_VFP + +/** @brief Rounds floating-point number to the nearest integer + + @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the + result is not defined. 
+ */ +CV_INLINE int +cvRound( double value ) +{ +#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \ + && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128d t = _mm_set_sd( value ); + return _mm_cvtsd_si32(t); +#elif defined _MSC_VER && defined _M_IX86 + int t; + __asm + { + fld value; + fistp t; + } + return t; +#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ + defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION + TEGRA_ROUND_DBL(value); +#elif defined CV_ICC || defined __GNUC__ +# if CV_VFP + ARM_ROUND_DBL(value) +# else + return (int)lrint(value); +# endif +#else + /* it's ok if round does not comply with IEEE754 standard; + the tests should allow +/-1 difference when the tested functions use round */ + return (int)(value + (value >= 0 ? 0.5 : -0.5)); +#endif +} + + +/** @brief Rounds floating-point number to the nearest integer not larger than the original. + + The function computes an integer i such that: + \f[i \le \texttt{value} < i+1\f] + @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the + result is not defined. + */ +CV_INLINE int cvFloor( double value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128d t = _mm_set_sd( value ); + int i = _mm_cvtsd_si32(t); + return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i))); +#elif defined __GNUC__ + int i = (int)value; + return i - (i > value); +#else + int i = cvRound(value); + float diff = (float)(value - i); + return i - (diff < 0); +#endif +} + +/** @brief Rounds floating-point number to the nearest integer not larger than the original. + + The function computes an integer i such that: + \f[i \le \texttt{value} < i+1\f] + @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the + result is not defined. + */ +CV_INLINE int cvCeil( double value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__) + __m128d t = _mm_set_sd( value ); + int i = _mm_cvtsd_si32(t); + return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t)); +#elif defined __GNUC__ + int i = (int)value; + return i + (i < value); +#else + int i = cvRound(value); + float diff = (float)(i - value); + return i + (diff < 0); +#endif +} + +/** @brief Determines if the argument is Not A Number. + + @param value The input floating-point value + + The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0 + otherwise. */ +CV_INLINE int cvIsNaN( double value ) +{ + Cv64suf ieee754; + ieee754.f = value; + return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + + ((unsigned)ieee754.u != 0) > 0x7ff00000; +} + +/** @brief Determines if the argument is Infinity. + + @param value The input floating-point value + + The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard) + and 0 otherwise. 
*/ +CV_INLINE int cvIsInf( double value ) +{ + Cv64suf ieee754; + ieee754.f = value; + return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && + (unsigned)ieee754.u == 0; +} + +#ifdef __cplusplus + +/** @overload */ +CV_INLINE int cvRound(float value) +{ +#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \ + defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128 t = _mm_set_ss( value ); + return _mm_cvtss_si32(t); +#elif defined _MSC_VER && defined _M_IX86 + int t; + __asm + { + fld value; + fistp t; + } + return t; +#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ + defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION + TEGRA_ROUND_FLT(value); +#elif defined CV_ICC || defined __GNUC__ +# if CV_VFP + ARM_ROUND_FLT(value) +# else + return (int)lrintf(value); +# endif +#else + /* it's ok if round does not comply with IEEE754 standard; + the tests should allow +/-1 difference when the tested functions use round */ + return (int)(value + (value >= 0 ? 0.5f : -0.5f)); +#endif +} + +/** @overload */ +CV_INLINE int cvRound( int value ) +{ + return value; +} + +/** @overload */ +CV_INLINE int cvFloor( float value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128 t = _mm_set_ss( value ); + int i = _mm_cvtss_si32(t); + return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i))); +#elif defined __GNUC__ + int i = (int)value; + return i - (i > value); +#else + int i = cvRound(value); + float diff = (float)(value - i); + return i - (diff < 0); +#endif +} + +/** @overload */ +CV_INLINE int cvFloor( int value ) +{ + return value; +} + +/** @overload */ +CV_INLINE int cvCeil( float value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__) + __m128 t = _mm_set_ss( value ); + int i = _mm_cvtss_si32(t); + return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t)); +#elif defined __GNUC__ + int i = (int)value; + return i + (i < value); +#else + int i = cvRound(value); + float diff = (float)(i - value); + return i + (diff < 0); +#endif +} + +/** @overload */ +CV_INLINE int cvCeil( int value ) +{ + return value; +} + +/** @overload */ +CV_INLINE int cvIsNaN( float value ) +{ + Cv32suf ieee754; + ieee754.f = value; + return (ieee754.u & 0x7fffffff) > 0x7f800000; +} + +/** @overload */ +CV_INLINE int cvIsInf( float value ) +{ + Cv32suf ieee754; + ieee754.f = value; + return (ieee754.u & 0x7fffffff) == 0x7f800000; +} + +#include + +namespace cv +{ + +/////////////// saturate_cast (used in image & signal processing) /////////////////// + +/** + Template function for accurate conversion from one primitive type to another. + + The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\() + and others. They perform an efficient and accurate conversion from one primitive type to another + (see the introduction chapter). saturate in the name means that when the input value v is out of the + range of the target type, the result is not formed just by taking low bits of the input, but instead + the value is clipped. For example: + @code + uchar a = saturate_cast(-100); // a = 0 (UCHAR_MIN) + short b = saturate_cast(33333.33333); // b = 32767 (SHRT_MAX) + @endcode + Such clipping is done when the target type is unsigned char , signed char , unsigned short or + signed short . For 32-bit integers, no clipping is done. 
+
+ When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
+ the floating-point value is first rounded to the nearest integer and then clipped if needed (when
+ the target type is 8- or 16-bit).
+
+ This operation is used in the simplest or most complex image processing functions in OpenCV.
+
+ @param v Function parameter.
+ @sa add, subtract, multiply, divide, Mat::convertTo
+ */
+template<typename _Tp> static inline _Tp saturate_cast(uchar v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(schar v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(ushort v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(short v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(double v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); }
+
+//! @cond IGNORED
+
+template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); }
+template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(short v) { return saturate_cast<uchar>((int)v); }
+template<> inline uchar saturate_cast<uchar>(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(float v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(double v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
+
+template<> inline schar saturate_cast<schar>(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(short v) { return saturate_cast<schar>((int)v); }
+template<> inline schar saturate_cast<schar>(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(float v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(double v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); }
+
+template<> inline ushort saturate_cast<ushort>(schar v) { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(short v) { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
+template<> inline ushort saturate_cast<ushort>(float v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(double v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); }
+
+template<> inline short saturate_cast<short>(ushort v) { return (short)std::min((int)v, SHRT_MAX); }
+template<> inline short saturate_cast<short>(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); }
+template<> inline short saturate_cast<short>(float v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(double v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); }
+
+template<> inline int saturate_cast<int>(float v) { return cvRound(v); }
+template<> inline int saturate_cast<int>(double v) { return cvRound(v); }
+
+// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
+template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
+template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
+
+//! @endcond
+
+}
+
+#endif // __cplusplus
+
 #endif //__OPENCV_HAL_H__
diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp
new file mode 100644
index 0000000000..b7b147a199
--- /dev/null
+++ b/modules/hal/include/opencv2/hal/intrin.hpp
@@ -0,0 +1,2254 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_HAL_INTRIN_HPP__
+#define __OPENCV_HAL_INTRIN_HPP__
+
+#include
+#include
+#include
+
+#define OPENCV_HAL_ADD(a, b) ((a) + (b))
+#define OPENCV_HAL_AND(a, b) ((a) & (b))
+#define OPENCV_HAL_NOP(a) (a)
+#define OPENCV_HAL_1ST(a, b) (a)
+
+namespace cv { namespace hal {
+
+template<typename _Tp> struct TypeTraits
+{
+    typedef _Tp int_type;
+    typedef _Tp uint_type;
+    typedef _Tp abs_type;
+    typedef _Tp sum_type;
+
+    enum { delta = 0, shift = 0 };
+
+    static int_type reinterpret_int(_Tp x) { return x; }
+    static uint_type reinterpet_uint(_Tp x) { return x; }
+    static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
+};
+
+template<> struct TypeTraits<uchar>
+{
+    typedef uchar value_type;
+    typedef schar int_type;
+    typedef uchar uint_type;
+    typedef uchar abs_type;
+    typedef int sum_type;
+
+    typedef ushort w_type;
+
+    enum { delta = 128, shift = 8 };
+
+    static int_type reinterpret_int(value_type x) { return (int_type)x; }
+    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
+    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
+};
+
+template<> struct TypeTraits<schar>
+{
+    typedef schar value_type;
+    typedef schar int_type;
+    typedef uchar uint_type;
+    typedef uchar abs_type;
+    typedef int sum_type;
+
+    typedef short w_type;
+
+    enum { delta = 128, shift = 8 };
+
+    static int_type reinterpret_int(value_type x) { return (int_type)x; }
+    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
+    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
+};
+
+template<> struct TypeTraits<ushort>
+{
+    typedef ushort value_type;
+    typedef short int_type;
+    typedef ushort uint_type;
+    typedef ushort abs_type;
+    typedef int sum_type;
+
+    typedef unsigned w_type;
+    typedef uchar nu_type;
+
+    enum { delta = 32768, shift = 16 };
+
+    static int_type reinterpret_int(value_type x) { return (int_type)x; }
+    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
+    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
+};
+
+template<> struct TypeTraits<short>
+{
+    typedef short value_type;
+    typedef short int_type;
+    typedef ushort uint_type;
+    typedef ushort abs_type;
+    typedef int sum_type;
+
+    typedef int w_type;
+    typedef uchar nu_type;
+    typedef schar n_type;
+
+    enum { delta = 128, shift = 8 };
+
+    static int_type reinterpret_int(value_type x) { return (int_type)x; }
+    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
+    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
+};
+
+template<> struct TypeTraits<unsigned>
+{
+    typedef unsigned value_type;
+    typedef int int_type;
+    typedef unsigned uint_type;
+    typedef unsigned abs_type;
+    typedef unsigned sum_type;
+
+    typedef ushort nu_type;
+
+    static int_type reinterpret_int(value_type x) { return (int_type)x; }
+    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
+    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
+};
+
+template<> struct TypeTraits<int>
+{
+    typedef int value_type;
+    typedef int int_type;
+    typedef unsigned uint_type;
+    typedef unsigned abs_type;
+    typedef int sum_type;
+
+    typedef short n_type;
+    typedef ushort nu_type;
+
+    static int_type reinterpret_int(value_type x) { return (int_type)x; }
+    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
+    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
+};
+
+template<> struct TypeTraits<float>
+{
+    typedef float value_type;
+    typedef int int_type;
+    typedef unsigned uint_type;
+    typedef float abs_type;
+    typedef float sum_type;
+
+    typedef double w_type;
+
+    static int_type reinterpret_int(value_type x)
+    {
+        Cv32suf u;
+        u.f = x;
+        return u.i;
+    }
+    static uint_type reinterpet_uint(value_type x)
+    {
+        Cv32suf u;
+        u.f = x;
+        return u.u;
+    }
+    static value_type reinterpret_from_int(int_type x)
+    {
+        Cv32suf u;
+        u.i = x;
+        return u.f;
+    }
+};
+
+template<> struct TypeTraits<double>
+{
+    typedef double value_type;
+    typedef int64 int_type;
+    typedef uint64 uint_type;
+    typedef double abs_type;
+    typedef double sum_type;
+    static int_type reinterpret_int(value_type x)
+    {
+        Cv64suf u;
+        u.f = x;
+        return u.i;
+    }
+    static uint_type reinterpet_uint(value_type x)
+    {
+        Cv64suf u;
+        u.f = x;
+        return u.u;
+    }
+    static value_type reinterpret_from_int(int_type x)
+    {
+        Cv64suf u;
+        u.i = x;
+        return u.f;
+    }
+};
+
+template<typename _Tp, int n> struct v_reg
+{
+    typedef _Tp scalar_type;
+    typedef v_reg<typename TypeTraits<_Tp>::int_type, n> int_vec;
+    typedef v_reg<typename TypeTraits<_Tp>::abs_type, n> abs_vec;
+    enum { channels = n };
+
+    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
+    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+          _Tp s4, _Tp s5, _Tp s6, _Tp s7)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+    }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+          _Tp s4, _Tp s5, _Tp s6, _Tp s7,
+          _Tp s8, _Tp s9, _Tp s10, _Tp s11,
+          _Tp s12, _Tp s13, _Tp s14, _Tp s15)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
+        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
+    }
+
+    v_reg() {}
+    v_reg(const v_reg<_Tp, n> & r)
+    {
+        for( int i = 0; i < n; i++ )
+            s[i] = r.s[i];
+    }
+
+    _Tp get(const int i) const { return s[i]; }
+    _Tp get0() const { return s[0]; }
+    v_reg<_Tp, n> high() const
+    {
+        v_reg<_Tp, n> c;
+        int i;
+        for( i = 0; i < n/2; i++ )
+        {
+            c.s[i] = s[i+(n/2)];
+            c.s[i+(n/2)] = 0;
+        }
+        return c;
+    }
+
+    static v_reg<_Tp, n> zero()
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = (_Tp)0;
+        return c;
+    }
+
+    static v_reg<_Tp, n> all(_Tp s)
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = s;
+        return c;
+    }
+
+    template<typename _Tp2, int n2> static v_reg<_Tp2, n2> reinterpret_as(const v_reg<_Tp, n>& a)
+    {
+        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
+        v_reg<_Tp2, n2> c;
+        memcpy(&c.s[0], &a.s[0], bytes);
+        return c;
+    }
+
+    _Tp s[n];
+};
+
+#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+
for( int i = 0; i < n; i++ ) \ + c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ + return c; \ +} \ +template inline v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + for( int i = 0; i < n; i++ ) \ + a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ + return a; \ +} + +OPENCV_HAL_IMPL_BIN_OP(+) +OPENCV_HAL_IMPL_BIN_OP(-) +OPENCV_HAL_IMPL_BIN_OP(*) +OPENCV_HAL_IMPL_BIN_OP(/) + +#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ +template inline v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + typedef typename TypeTraits<_Tp>::int_type itype; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)(TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ + TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ + return c; \ +} \ +template inline v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef typename TypeTraits<_Tp>::int_type itype; \ + for( int i = 0; i < n; i++ ) \ + a.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)(TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ + TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ + return a; \ +} + +OPENCV_HAL_IMPL_BIT_OP(&) +OPENCV_HAL_IMPL_BIT_OP(|) +OPENCV_HAL_IMPL_BIT_OP(^) + +template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) +{ + v_reg<_Tp, n> c; + typedef typename TypeTraits<_Tp>::int_type itype; + for( int i = 0; i < n; i++ ) + c.s[i] = TypeTraits<_Tp>::reinterpret_from_int(~TypeTraits<_Tp>::reinterpret_int(a.s[i])); + return c; +} + +#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ +template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ +{ \ + v_reg<_Tp2, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cfunc(a.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename TypeTraits<_Tp>::abs_type)std::abs, typename TypeTraits<_Tp>::abs_type) +OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) +OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) +OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) +OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) + +#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \ +template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cfunc(a.s[i], b.s[i]); \ + return c; \ +} \ +template inline _Tp hfunc(const v_reg<_Tp, n>& a) \ +{ \ + _Tp c = a.s[0]; \ + for( int i = 1; i < n; i++ ) \ + c = cfunc(c, a.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min) +OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max) + +template inline void v_minmax(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval) +{ + for( int i = 0; i < n; i++ ) + { + minval.s[i] = std::min(a.s[i], b.s[i]); + maxval.s[i] = std::max(a.s[i], b.s[i]); + } +} + + +#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ +template inline v_reg<_Tp, n> \ + operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef typename TypeTraits<_Tp>::int_type itype; \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ + return c; \ +} + +OPENCV_HAL_IMPL_CMP_OP(<) 
+OPENCV_HAL_IMPL_CMP_OP(>) +OPENCV_HAL_IMPL_CMP_OP(<=) +OPENCV_HAL_IMPL_CMP_OP(>=) +OPENCV_HAL_IMPL_CMP_OP(==) +OPENCV_HAL_IMPL_CMP_OP(!=) + +#define OPENCV_HAL_IMPL_ADDSUB_OP(func, bin_op, cast_op, _Tp2) \ +template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef _Tp2 rtype; \ + v_reg c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_ADDSUB_OP(v_add_wrap, +, (_Tp), _Tp) +OPENCV_HAL_IMPL_ADDSUB_OP(v_sub_wrap, -, (_Tp), _Tp) +OPENCV_HAL_IMPL_ADDSUB_OP(v_absdiff, -, (rtype)std::abs, typename TypeTraits<_Tp>::abs_type) + +template inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); + return c; +} + +template inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; + return c; +} + +template inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) +{ + v_reg<_Tp, n> d; + for( int i = 0; i < n; i++ ) + d.s[i] = a.s[i]*b.s[i] + c.s[i]; + return d; +} + +template inline v_reg<_Tp, n> v_mullo(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = (_Tp)(a.s[i]*b.s[i]); + return c; +} + +template inline v_reg<_Tp, n> v_mulhi2(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = (_Tp)((a.s[i]*b.s[i]*2 + TypeTraits<_Tp>::delta) >> TypeTraits<_Tp>::shift); + return c; +} + +#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ +template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = (_Tp)(a.s[i] shift_op imm); \ + return c; \ +} + +OPENCV_HAL_IMPL_SHIFT_OP(<<) +OPENCV_HAL_IMPL_SHIFT_OP(>>) + +template inline typename TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) +{ + typename TypeTraits<_Tp>::sum_type c = a.s[0]; + for( int i = 1; i < n; i++ ) + c += a.s[i]; + return c; +} + +template inline int v_signmask(const v_reg<_Tp, n>& a) +{ + int mask = 0; + for( int i = 0; i < n; i++ ) + mask |= (TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; + return mask; +} + +template inline bool v_check_all(const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + if( TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) + return false; + return true; +} + +template inline bool v_check_any(const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + if( TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) + return true; + return false; +} + +template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, + const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? 
b.s[i] : a.s[i]; + return c; +} + +template inline void v_expand(const v_reg<_Tp, n>& a, + v_reg::w_type, n/2>& b0, + v_reg::w_type, n/2>& b1) +{ + for( int i = 0; i < (n/2); i++ ) + { + b0.s[i] = a.s[i]; + b1.s[i] = a.s[i+(n/2)]; + } +} + +template inline v_reg::int_type, n> + v_reinterpret_int(const v_reg<_Tp, n>& a) +{ + v_reg::int_type, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = TypeTraits<_Tp>::reinterpret_int(a.s[i]); + return c; +} + +template inline v_reg::uint_type, n> + v_reinterpret_uint(const v_reg<_Tp, n>& a) +{ + v_reg::uint_type, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = TypeTraits<_Tp>::reinterpret_uint(a.s[i]); + return c; +} + +template inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, + v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) +{ + int i; + for( i = 0; i < n/2; i++ ) + { + b0.s[i*2] = a0.s[i]; + b0.s[i*2+1] = a1.s[i]; + } + for( ; i < n; i++ ) + { + b1.s[i*2-n] = a0.s[i]; + b1.s[i*2-n+1] = a1.s[i]; + } +} + +template inline v_reg<_Tp, n> v_load(const _Tp* ptr) +{ + return v_reg<_Tp, n>(ptr); +} + +template inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr) +{ + return v_reg<_Tp, n>(ptr); +} + +template inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n/2; i++ ) + { + c.s[i] = loptr[i]; + c.s[i+n/2] = hiptr[i]; + } + return c; +} + +template inline v_reg::w_type, n> v_load_expand(const _Tp* ptr) +{ + typedef typename TypeTraits<_Tp>::w_type w_type; + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + +template inline v_reg::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr) +{ + typedef typename TypeTraits::w_type>::w_type w_type; + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + +template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) +{ + int i, i3; + for( i = i3 = 0; i < n; i++, i3 += 3 ) + { + a.s[i] = ptr[i3]; + b.s[i] = ptr[i3+1]; + c.s[i] = ptr[i3+2]; + } +} + +template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, + v_reg<_Tp, n>& d) +{ + int i, i4; + for( i = i4 = 0; i < n; i++, i4 += 4 ) + { + a.s[i] = ptr[i4]; + b.s[i] = ptr[i4+1]; + c.s[i] = ptr[i4+2]; + d.s[i] = ptr[i4+3]; + } +} + +template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) +{ + int i, i3; + for( i = i3 = 0; i < n; i++, i3 += 3 ) + { + ptr[i3] = a.s[i]; + ptr[i3+1] = b.s[i]; + ptr[i3+2] = c.s[i]; + } +} + +template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, + const v_reg<_Tp, n>& d) +{ + int i, i4; + for( i = i4 = 0; i < n; i++, i4 += 4 ) + { + ptr[i4] = a.s[i]; + ptr[i4+1] = b.s[i]; + ptr[i4+2] = c.s[i]; + ptr[i4+3] = d.s[i]; + } +} + +template inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + +template inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < (n/2); i++ ) + ptr[i] = a.s[i]; +} + +template inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < (n/2); i++ ) + ptr[i] = a.s[i+(n/2)]; +} + +template inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + +template inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < (n/2); i++ 
) + { + c.s[i] = a.s[i]; + c.s[i+(n/2)] = b.s[i]; + } +} + +template inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = a.s[i+(n/2)]; + c.s[i+(n/2)] = b.s[i+(n/2)]; + } +} + +template inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) +{ + for( int i = 0; i < (n/2); i++ ) + { + low.s[i] = a.s[i]; + low.s[i+(n/2)] = b.s[i]; + high.s[i] = a.s[i+(n/2)]; + high.s[i+(n/2)] = b.s[i+(n/2)]; + } +} + +template inline v_reg v_round(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvRound(a.s[i]); + return c; +} + +template inline v_reg v_floor(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvFloor(a.s[i]); + return c; +} + +template inline v_reg v_ceil(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvCeil(a.s[i]); + return c; +} + +template inline v_reg v_trunc(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (int)(a.s[i]); + return c; +} + +template inline v_reg v_round(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvRound(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_floor(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvFloor(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_ceil(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvCeil(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_trunc(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvCeil(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_cvt_f32(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (float)a.s[i]; + return c; +} + +template inline v_reg v_cvt_f64(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + +template inline v_reg v_cvt_f64(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + +template inline v_reg<_Tp2, n*2> v_cvtsat(const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b) +{ + v_reg<_Tp2, n*2> c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = saturate_cast<_Tp2>(a.s[i]); + c.s[i+n] = saturate_cast<_Tp2>(b.s[i]); + } + return c; +} + +template inline v_reg<_Tp2, n*2> v_cvtsat(const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, + int rshift) +{ + v_reg<_Tp2, n*2> c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = saturate_cast<_Tp2>((a.s[i] + (1<<(rshift-1))) >> rshift); + c.s[i+n] = saturate_cast<_Tp2>((b.s[i] + (1<<(rshift-1))) >> rshift); + } + return c; +} + +template inline void v_storesat(_Tp2* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + { + ptr[i] = saturate_cast<_Tp2>(a.s[i]); + } +} + +template inline void v_storesat(_Tp2* ptr, const v_reg<_Tp, n>& a, int rshift) +{ + for( int i = 0; i < n; i++ ) + { + ptr[i] = saturate_cast<_Tp2>((a.s[i] + (1<<(rshift-1))) >> rshift); + } +} + +template inline void v_transpose4x4(const v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, + const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, + v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, + v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3) +{ + b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); + b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); + b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); + b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); 
+} + +#if CV_SSE2 + +#define CV_SIMD128 1 +#define CV_SIMD128_64F 1 + +struct v_uint8x16 +{ + explicit v_uint8x16(__m128i v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6, (char)v7, + (char)v8, (char)v9, (char)v10, (char)v11, + (char)v12, (char)v13, (char)v14, (char)v15); + } + uchar get(const int i) const + { + return (uchar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8)); + } + uchar get0() const + { + return (uchar)_mm_cvtsi128_si32(val); + } + + __m128i val; +}; + +struct v_int8x16 +{ + explicit v_int8x16(__m128i v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6, (char)v7, + (char)v8, (char)v9, (char)v10, (char)v11, + (char)v12, (char)v13, (char)v14, (char)v15); + } + schar get(const int i) const + { + return (schar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8)); + } + schar get0() const + { + return (schar)_mm_cvtsi128_si32(val); + } + + __m128i val; +}; + +struct v_uint16x8 +{ + explicit v_uint16x8(__m128i v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7); + } + ushort get(const int i) const + { + return (ushort)_mm_extract_epi16(val, i); + } + uchar get0() const + { + return (ushort)_mm_cvtsi128_si32(val); + } + + __m128i val; +}; + +struct v_int16x8 +{ + explicit v_int16x8(__m128i v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7); + } + short get(const int i) const + { + return (short)_mm_extract_epi16(val, i); + } + short get0() const + { + return (short)_mm_cvtsi128_si32(val); + } + __m128i val; +}; + +struct v_uint32x4 +{ + explicit v_uint32x4(__m128i v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); + } + unsigned get(const int i) const + { + unsigned CV_DECL_ALIGNED(16) buf[4]; + _mm_store_si128((__m128i*)buf, val); + return buf[i]; + } + unsigned get0() const + { + return (unsigned)_mm_cvtsi128_si32(val); + } + __m128i val; +}; + +struct v_int32x4 +{ + explicit v_int32x4(__m128i v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + val = _mm_setr_epi32(v0, v1, v2, v3); + } + int get(int i) const + { + int CV_DECL_ALIGNED(16) buf[4]; + _mm_store_si128((__m128i*)buf, val); + return buf[i]; + } + int get0() const + { + return _mm_cvtsi128_si32(val); + } + __m128i val; +}; + +struct v_float32x4 +{ + explicit v_float32x4(__m128 v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + val = _mm_setr_ps(v0, v1, v2, v3); + } + float get(int i) const + { + float CV_DECL_ALIGNED(16) buf[4]; + _mm_store_ps(buf, val); + return buf[i]; + } + float get0() const + { + return _mm_cvtss_f32(val); + } + __m128 val; +}; + +struct v_float64x2 +{ + explicit v_float64x2(__m128d v) : val(v) {} + v_float64x2(double v0, double v1) + { + val = _mm_setr_pd(v0, 
v1); + } + double get(int i) const + { + double CV_DECL_ALIGNED(16) buf[2]; + _mm_store_pd(buf, val); + return buf[i]; + } + double get0() const + { + return _mm_cvtsd_f64(val); + } + __m128d val; +}; + +inline v_uint8x16 v_setzero_u8() { return v_uint8x16(_mm_setzero_si128()); } +inline v_int8x16 v_setzero_s8() { return v_int8x16(_mm_setzero_si128()); } +inline v_uint16x8 v_setzero_u16() { return v_uint16x8(_mm_setzero_si128()); } +inline v_int16x8 v_setzero_s16() { return v_int16x8(_mm_setzero_si128()); } +inline v_uint32x4 v_setzero_u32() { return v_uint32x4(_mm_setzero_si128()); } +inline v_int32x4 v_setzero_s32() { return v_int32x4(_mm_setzero_si128()); } +inline v_float32x4 v_setzero_f32() { return v_float32x4(_mm_setzero_ps()); } +inline v_float64x2 v_setzero_f64() { return v_float64x2(_mm_setzero_pd()); } + +inline v_uint8x16 v_setall_u8(uchar v) { return v_uint8x16(_mm_set1_epi8((char)v)); } +inline v_int8x16 v_setall_s8(schar v) { return v_int8x16(_mm_set1_epi8((char)v)); } +inline v_uint16x8 v_setall_u16(ushort v) { return v_uint16x8(_mm_set1_epi16((short)v)); } +inline v_int16x8 v_setall_s16(short v) { return v_int16x8(_mm_set1_epi16((short)v)); } +inline v_uint32x4 v_setall_u32(unsigned v) { return v_uint32x4(_mm_set1_epi32((int)v)); } +inline v_int32x4 v_setall_s32(int v) { return v_int32x4(_mm_set1_epi32(v)); } +inline v_float32x4 v_setall_f32(float v) { return v_float32x4(_mm_set1_ps(v)); } +inline v_float64x2 v_setall_f64(double v) { return v_float64x2(_mm_set1_pd(v)); } + +template inline v_uint8x16 v_reinterpret_u8(const _Tpvec& a) +{ return v_uint8x16(a.val); } + +inline v_uint8x16 v_reinterpret_u8(const v_float32x4& a) +{ return v_uint8x16(_mm_castps_si128(a.val)); } + +inline v_uint8x16 v_reinterpret_u8(const v_float64x2& a) +{ return v_uint8x16(_mm_castpd_si128(a.val)); } + +template inline v_int8x16 v_reinterpret_s8(const _Tpvec& a) +{ return v_int8x16(a.val); } + +inline v_int8x16 v_reinterpret_s8(const v_float32x4& a) +{ return v_int8x16(_mm_castps_si128(a.val)); } + +inline v_int8x16 v_reinterpret_s8(const v_float64x2& a) +{ return v_int8x16(_mm_castpd_si128(a.val)); } + +template inline v_uint16x8 v_reinterpret_u16(const _Tpvec& a) +{ return v_uint16x8(a.val); } + +inline v_uint16x8 v_reinterpret_u16(const v_float32x4& a) +{ return v_uint16x8(_mm_castps_si128(a.val)); } + +inline v_uint16x8 v_reinterpret_u16(const v_float64x2& a) +{ return v_uint16x8(_mm_castpd_si128(a.val)); } + +template inline v_int16x8 v_reinterpret_s16(const _Tpvec& a) +{ return v_int16x8(a.val); } + +inline v_int16x8 v_reinterpret_s16(const v_float32x4& a) +{ return v_int16x8(_mm_castps_si128(a.val)); } + +inline v_int16x8 v_reinterpret_s16(const v_float64x2& a) +{ return v_int16x8(_mm_castpd_si128(a.val)); } + +template inline v_uint32x4 v_reinterpret_u32(const _Tpvec& a) +{ return v_uint32x4(a.val); } + +inline v_uint32x4 v_reinterpret_u32(const v_float32x4& a) +{ return v_uint32x4(_mm_castps_si128(a.val)); } + +inline v_uint32x4 v_reinterpret_u32(const v_float64x2& a) +{ return v_uint32x4(_mm_castpd_si128(a.val)); } + +template inline v_int32x4 v_reinterpret_s32(const _Tpvec& a) +{ return v_int32x4(a.val); } + +inline v_int32x4 v_reinterpret_s32(const v_float32x4& a) +{ return v_int32x4(_mm_castps_si128(a.val)); } + +inline v_int32x4 v_reinterpret_s32(const v_float64x2& a) +{ return v_int32x4(_mm_castpd_si128(a.val)); } + +template inline v_float32x4 v_reinterpret_f32(const _Tpvec& a) +{ return v_float32x4(_mm_castsi128_ps(a.val)); } + +inline v_float32x4 v_reinterpret_f32(const 
v_float64x2& a) +{ return v_float32x4(_mm_castpd_ps(a.val)); } + +template inline v_float64x2 v_reinterpret_f64(const _Tpvec& a) +{ return v_float64x2(_mm_castsi128_pd(a.val)); } + +inline v_float64x2 v_reinterpret_f64(const v_float64x2& a) +{ return v_float64x2(_mm_castps_pd(a.val)); } + +inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i delta = _mm_set1_epi16(255); + return v_uint8x16(_mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), + _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta))); +} +inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), + _mm_srli_epi16(_mm_add_epi16(b.val, delta), n))); +} + +inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b) +{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } +inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), + _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); +} + +inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a) +{ + __m128i delta = _mm_set1_epi16(255); + _mm_storel_epi64((__m128i*)ptr, + _mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta)); +} + +inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a, int n) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + _mm_storel_epi64((__m128i*)ptr, + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta)); +} + +inline void v_storesat_u8(uchar* ptr, const v_int16x8& a) +{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } + +inline void v_storesat_u8(uchar* ptr, const v_int16x8& a, int n) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + _mm_storel_epi64((__m128i*)ptr, + _mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta)); +} + +inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i delta = _mm_set1_epi16(127); + return v_int8x16(_mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), + _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta))); +} + +inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
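// Adding the bias (1 << (n-1)) before the right shift by n implements
// rounding to the nearest integer (ties rounded up) rather than plain truncation.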
+ __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + return v_int8x16(_mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), + _mm_srli_epi16(_mm_add_epi16(b.val, delta), n))); +} + +inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b) +{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); } + +inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), + _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); +} + +inline void v_storesat_s8(schar* ptr, const v_uint16x8& a) +{ + __m128i delta = _mm_set1_epi16(127); + _mm_storel_epi64((__m128i*)ptr, + _mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta)); +} + +inline void v_storesat_s8(schar* ptr, const v_uint16x8& a, int n) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + _mm_storel_epi64((__m128i*)ptr, + _mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta)); +} +inline void v_storesat_s8(schar* ptr, const v_int16x8& a) +{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } + +inline void v_storesat_s8(schar* ptr, const v_int16x8& a, int n) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + _mm_storel_epi64((__m128i*)ptr, + _mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta)); +} + +// bit-wise "mask ? a : b" +inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) +{ + return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); +} + +inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); + __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); + __m128i r = _mm_packs_epi32(a1, b1); + return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); +} +inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); + return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); +} +inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b) +{ + __m128i delta32 = _mm_set1_epi32(32768); + __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); + return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); +} +inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); + return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); +} + +inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a) +{ + __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); + __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); + 
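// The 32768 bias moves the clamped unsigned values into the signed range handled
// by _mm_packs_epi32; subtracting -32768 afterwards undoes the bias, so r holds the
// saturated unsigned 16-bit results (only its low 64 bits are stored below).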
_mm_storel_epi64((__m128i*)ptr, r); +} +inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); + _mm_storel_epi64((__m128i*)ptr, r); +} +inline void v_storesat_u16(ushort* ptr, const v_int32x4& a) +{ + __m128i delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(a.val, delta32); + __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); + _mm_storel_epi64((__m128i*)ptr, r); +} +inline void v_storesat_u16(ushort* ptr, const v_int32x4& a, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); + _mm_storel_epi64((__m128i*)ptr, r); +} + +inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767); + __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val); + __m128i b1 = v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val); + return v_int16x8(_mm_packs_epi32(a1, b1)); +} +inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)); + return v_int16x8(_mm_packs_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), + _mm_srli_epi32(_mm_add_epi32(b.val, delta), n))); +} +inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b) +{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); } +inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)); + return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), + _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); +} + +inline void v_storesat_s16(short* ptr, const v_uint32x4& a) +{ + __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767); + __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val); + _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); +} +inline void v_storesat_s16(short* ptr, const v_uint32x4& a, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)); + __m128i a1 = _mm_srli_epi32(_mm_add_epi32(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); +} +inline void v_storesat_s16(short* ptr, const v_int32x4& a) +{ + _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); +} +inline void v_storesat_s16(short* ptr, const v_int32x4& a, int n) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)); + __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); + __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val); + __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val); + __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val); + + return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3))); +} + + +#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, 
_Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { \ + return _Tpvec(intrin(a.val, b.val)); \ + } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { \ + a.val = intrin(a.val, b.val); \ + return a; \ + } + +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) + +inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i c0 = _mm_mul_epu32(a.val, b.val); + __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + __m128i d0 = _mm_unpacklo_epi32(c0, c1); + __m128i d1 = _mm_unpackhi_epi32(c0, c1); + return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); +} +inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) +{ + __m128i c0 = _mm_mul_epu32(a.val, b.val); + __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + __m128i d0 = _mm_unpacklo_epi32(c0, c1); + __m128i d1 = _mm_unpackhi_epi32(c0, c1); + return v_int32x4(_mm_unpacklo_epi64(d0, d1)); +} + +#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { \ + return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ + } + +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) + +inline v_float32x4 v_sqrt(v_float32x4 x) +{ return v_float32x4(_mm_sqrt_ps(x.val)); } +inline v_float64x2 v_sqrt(v_float64x2 x) +{ return v_float64x2(_mm_sqrt_pd(x.val)); } + +inline v_float32x4 v_abs(v_float32x4 x) +{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } +inline v_float64x2 v_abs(v_float64x2 x) +{ + return v_float64x2(_mm_and_pd(x.val, + _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); +} + +// TODO: exp, log, sin, cos + +#define 
OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) + +inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) +{ + __m128i delta = _mm_set1_epi8((char)0x80); + return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), + _mm_xor_si128(b.val, delta)))); +} +inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) +{ + __m128i delta = _mm_set1_epi8((char)0x80); + return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), + _mm_xor_si128(b.val, delta)))); +} +inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val))); +} +inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val)); +} +inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i delta = _mm_set1_epi32((int)0x80000000); + __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); + return v_uint32x4(v_select_si128(mask, b.val, a.val)); +} +inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i delta = _mm_set1_epi32((int)0x80000000); + __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); + return v_uint32x4(v_select_si128(mask, a.val, b.val)); +} +inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b) +{ + return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val)); +} +inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) +{ + return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val)); +} + +#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ +inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ +{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ +inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ +} \ +inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ +{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ +inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ +} \ +inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ +} \ +inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ +} \ +inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + __m128i 
not_mask = _mm_set1_epi32(-1); \ + __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ + return _Tpuvec(_mm_xor_si128(res, not_mask)); \ +} \ +inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + __m128i not_mask = _mm_set1_epi32(-1); \ + __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ + return _Tpuvec(_mm_xor_si128(res, not_mask)); \ +} \ +inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ +} \ +inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ +} \ +inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ +} \ +inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ +} + +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)0x80) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)0x8000) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) + +#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } + +OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) +OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) + +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) + +#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, sbit) \ +inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ +} \ +inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i smask = _mm_set1_epi8(sbit); \ + __m128i a1 = _mm_xor_si128(a.val, smask); \ + __m128i b1 = _mm_xor_si128(b.val, smask); \ + return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ +} + +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (char)0x80) +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (char)0x8000) + +#define 
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ +inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \ + return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \ +} \ +inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ + return _Tpvec(_mm_sqrt_##suffix(res)); \ +} \ +inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ + return _Tpvec(res); \ +} \ +inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ +{ \ + return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ +} + +OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) +OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) + +#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ +inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +{ \ + return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ +} \ +inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +{ \ + return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ +} \ +inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +{ \ + return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ +} \ +inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +{ \ + return _Tpsvec(_mm_srai_##suffix(a.val, imm)); \ +} + +OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16) +OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32) + +inline v_int16x8 v_mullo(const v_int16x8& a, const v_int16x8& b) +{ + return v_int16x8(_mm_mullo_epi16(a.val, b.val)); +} +inline v_uint16x8 v_mullo(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint16x8(_mm_mullo_epi16(a.val, b.val)); +} +inline v_int16x8 v_mulhi2(const v_int16x8& a, const v_int16x8& b) +{ + return v_int16x8(_mm_slli_epi16(_mm_mulhi_epi16(a.val, b.val), 1)); +} +inline v_uint16x8 v_mulhi2(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint16x8(_mm_slli_epi16(_mm_mulhi_epu16(a.val, b.val), 1)); +} + +#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ + _mm_loadl_epi64((const __m128i*)ptr1))); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storeu_si128((__m128i*)ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ _mm_store_si128((__m128i*)ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storel_epi64((__m128i*)ptr, a.val); } \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); } + +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int) + +#define 
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(_mm_load_##suffix(ptr)); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + return _Tpvec(_mm_castsi128_##suffix( \ + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ + _mm_loadl_epi64((const __m128i*)ptr1)))); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storeu_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ _mm_store_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + __m128i a1 = _mm_cast##suffix##_si128(a.val); \ + _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \ +} + +OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) +OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) + +#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + scalartype CV_DECL_ALIGNED(16) buf[4]; \ + v_store_aligned(buf, a); \ + scalartype s0 = scalar_func(buf[0], buf[1]); \ + scalartype s1 = scalar_func(buf[2], buf[3]); \ + return scalar_func(s0, s1); \ +} + +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) + +#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ +inline bool v_signmask(const _Tpvec& a) \ +{ \ + return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ +} \ +inline bool v_check_all(const _Tpvec& a) \ +{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \ +inline bool v_check_any(const _Tpvec& a) \ +{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; } + +#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) +inline __m128i v_packq_epi32(__m128i a) +{ + __m128i b = _mm_packs_epi32(a, a); + return _mm_packs_epi16(b, b); +} + +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) + +#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \ +inline _Tpvec v_select(const _Tpvec& mask, const 
_Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \ +} + +OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) +OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) + +#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ +inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ +{ \ + __m128i z = _mm_setzero_si128(); \ + b0.val = _mm_unpacklo_##suffix(a.val, z); \ + b1.val = _mm_unpackhi_##suffix(a.val, z); \ +} \ +inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ +{ \ + __m128i z = _mm_setzero_si128(); \ + return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ +} \ +inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ +{ \ + b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ + b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ +} \ +inline _Tpwsvec v_load_expand(const _Tps* ptr) \ +{ \ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ + return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ +} + +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) + +inline v_uint32x4 v_load_expand_q(const uchar* ptr) +{ + __m128i z = _mm_setzero_si128(); + __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); + return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); +} + +inline v_int32x4 v_load_expand_q(const schar* ptr) +{ + __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); + a = _mm_unpacklo_epi8(a, a); + a = _mm_unpacklo_epi8(a, a); + return v_int32x4(_mm_srai_epi32(a, 24)); +} + +#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ +inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ +{ \ + b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \ + b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \ +} \ +inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ +{ \ + __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ + return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \ +} \ +inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ +{ \ + __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ + return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \ +} \ +inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ +{ \ + __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ + c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \ + d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \ +} + +OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) 
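// A minimal usage sketch of the v_zip helper instantiated just above for
// v_float32x4 (the function name is hypothetical, for illustration only):
// it interleaves two rows of four floats into x0 y0 x1 y1 | x2 y2 x3 y3.
inline void v_example_interleave_xy(const float* x, const float* y, float* xy)
{
    v_float32x4 vx = v_load(x), vy = v_load(y);
    v_float32x4 lo = v_setzero_f32(), hi = v_setzero_f32();
    v_zip(vx, vy, lo, hi);      // lo = x0 y0 x1 y1, hi = x2 y2 x3 y3
    v_store(xy, lo);
    v_store(xy + 4, hi);
}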
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) + +inline v_int32x4 v_round(const v_float32x4& a) +{ return v_int32x4(_mm_cvtps_epi32(a.val)); } + +inline v_int32x4 v_floor(const v_float32x4& a) +{ + __m128i a1 = _mm_cvtps_epi32(a.val); + __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); + return v_int32x4(_mm_add_epi32(a1, mask)); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + __m128i a1 = _mm_cvtps_epi32(a.val); + __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); + return v_int32x4(_mm_sub_epi32(a1, mask)); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(_mm_cvttps_epi32(a.val)); } + +inline v_int32x4 v_round(const v_float64x2& a) +{ return v_int32x4(_mm_cvtpd_epi32(a.val)); } + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + __m128i a1 = _mm_cvtpd_epi32(a.val); + __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); + mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 + return v_int32x4(_mm_add_epi32(a1, mask)); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + __m128i a1 = _mm_cvtpd_epi32(a.val); + __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); + mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 + return v_int32x4(_mm_sub_epi32(a1, mask)); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ return v_int32x4(_mm_cvttpd_epi32(a.val)); } + +#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, \ + _Tpvec& b2, _Tpvec& b3) \ +{ \ + __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ + __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ + __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ + __m128i t3 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ +\ + b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \ +} + +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) + +#if 0 +inline void v_load_deinterleave(const uchar*, v_uint8x16&, v_uint8x16&, v_uint8x16&) +{ + // !!! TODO !!! +} +#endif + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) +{ + __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... + __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... + __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... + __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... + + __m128 v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... + __m128 v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... + __m128 v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... + __m128 v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... + + u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... + u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... + u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... + u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... 
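// Each unpack round halves the distance between samples of the same channel:
// the first round grouped samples 8 apart, this one groups them 4 apart, and the
// two rounds below finish the de-interleave, leaving a, b, c, d contiguous.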
+ + v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... + v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... + v2 = _mm_unpackhi_epi8(u0, u1); // b0 b2 b4 b6 ... + v3 = _mm_unpackhi_epi8(u2, u3); // b1 b3 b5 b7 ... + + a.val = _mm_unpacklo_epi8(v0, v1); + b.val = _mm_unpacklo_epi8(v2, v3); + c.val = _mm_unpackhi_epi8(v0, v1); + d.val = _mm_unpacklo_epi8(v2, v3); +} + +#if 0 +inline void v_load_deinterleave(const ushort*, v_uint16x8&, v_uint16x8&, v_uint16x8&) +{ + // !!! TODO !!! +} +#endif + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) +{ + __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 + __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ... + __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... + __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ... + + __m128 v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... + __m128 v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... + __m128 v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... + __m128 v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... + + u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ... + u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ... + u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ... + u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ... + + a.val = _mm_unpacklo_epi16(u0, u1); + b.val = _mm_unpackhi_epi16(u0, u1); + c.val = _mm_unpacklo_epi16(u2, u3); + d.val = _mm_unpackhi_epi16(u2, u3); +} + +#if 0 +inline void v_load_deinterleave(const unsigned*, v_uint32x4&, v_uint32x4&, v_uint32x4&) +{ + // !!! TODO !!! +} +#endif + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) +{ + v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0 + v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1 + v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2 + v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3 + + v_transpose4x4(u0, u1, u2, u3, a, b, c, d); +} + +inline void v_load_deinterleave(const float*, v_float32x4&, v_float32x4&, v_float32x4&) +{ + // !!! TODO !!! +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d) +{ + v_float32x4 u0(_mm_loadu_ps(ptr)); + v_float32x4 u1(_mm_loadu_ps(ptr + 4)); + v_float32x4 u2(_mm_loadu_ps(ptr + 8)); + v_float32x4 u3(_mm_loadu_ps(ptr + 12)); + + v_transpose4x4(u0, u1, u2, u3, a, b, c, d); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c, const v_uint8x16& d) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ... + __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ... + __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ... + __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ... + + __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ... + __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ... + __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ... + __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ... 
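// v0, v2, v1, v3 hold the interleaved pixels 0-3, 4-7, 8-11 and 12-15 respectively,
// which is why the stores below write them back in the order v0, v2, v1, v3.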
+ + _mm_storeu_si128((__m128i*)ptr, v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); + _mm_storeu_si128((__m128i*)(ptr + 32), v1); + _mm_storeu_si128((__m128i*)(ptr + 48), v3); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + const v_uint16x8& c, const v_uint16x8& d) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ... + __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ... + __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ... + __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ... + + __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ... + __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... + __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... + __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ... + + _mm_storeu_si128((__m128i*)ptr, v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v2); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 24), v3); +} + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(_mm_cvtepi32_ps(a.val)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + return v_float32x4(_mm_cvtpd_ps(a.val)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + return v_float64x2(_mm_cvtepi32_pd(a.val)); +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + return v_float64x2(_mm_cvtps_pd(a.val)); +} + +#elif CV_NEON + +#define CV_SIMD128 1 + +struct v_uint8x16 +{ + uint8x16_t val; +}; + +struct v_int8x16 +{ + int8x16_t val; +}; + +struct v_uint16x8 +{ + uint16x8_t val; +}; + +struct v_int16x8 +{ + int16x8_t val; +}; + +struct v_uint32x4 +{ + uint32x4_t val; +}; + +struct v_int32x4 +{ + int32x4_t val; +}; + +struct v_float32x4 +{ + float32x4_t val; +}; + +typedef v_reg v_float64x2; +typedef v_reg v_float64x4; + +#else + +typedef v_reg v_uint8x16; +typedef v_reg v_int8x16; +typedef v_reg v_uint16x8; +typedef v_reg v_int16x8; +typedef v_reg v_uint32x4; +typedef v_reg v_int32x4; +typedef v_reg v_float32x4; +typedef v_reg v_float32x8; +typedef v_reg v_float64x2; +typedef v_reg v_float64x4; + +inline v_uint8x16 v_setzero_u8() { return v_uint8x16::zero(); } +inline v_int8x16 v_setzero_s8() { return v_int8x16::zero(); } +inline v_uint16x8 v_setzero_u16() { return v_uint16x8::zero(); } +inline v_int16x8 v_setzero_s16() { return v_int16x8::zero(); } +inline v_uint32x4 v_setzero_u32() { return v_uint32x4::zero(); } +inline v_int32x4 v_setzero_s32() { return v_int32x4::zero(); } +inline v_float32x4 v_setzero_f32() { return v_float32x4::zero(); } +inline v_float64x2 v_setzero_f64() { return v_float64x2::zero(); } + +inline v_uint8x16 v_setall_u8(uchar v) { return v_uint8x16::all(v); } +inline v_int8x16 v_setall_s8(schar v) { return v_int8x16::all(v); } +inline v_uint16x8 v_setall_u16(ushort v) { return v_uint16x8::all(v); } +inline v_int16x8 v_setall_s16(short v) { return v_int16x8::all(v); } +inline v_uint32x4 v_setall_u32(unsigned v) { return v_uint32x4::all(v); } +inline v_int32x4 v_setall_s32(int v) { return v_int32x4::all(v); } +inline v_float32x4 v_setall_f32(float v) { return v_float32x4::all(v); } +inline v_float64x2 v_setall_f64(double v) { return v_float64x2::all(v); } + +template inline v_uint8x16 v_reinterpret_u8(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, n>::template reinterpret_as(a); } + +template inline v_int8x16 v_reinterpret_s8(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, 
n>::template reinterpret_as(a); } + +template inline v_uint16x8 v_reinterpret_u16(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, n>::template reinterpret_as(a); } + +template inline v_int16x8 v_reinterpret_s16(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, n>::template reinterpret_as(a); } + +template inline v_uint32x4 v_reinterpret_u32(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, n>::template reinterpret_as(a); } + +template inline v_int32x4 v_reinterpret_s32(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, n>::template reinterpret_as(a); } + +template inline v_float32x4 v_reinterpret_f32(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, n>::template reinterpret_as(a); } + +template inline v_float64x2 v_reinterpret_f64(const v_reg<_Tp, n>& a) +{ return v_reg<_Tp, n>::template reinterpret_as(a); } + +inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b) +{ return v_cvtsat(a, b); } +inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) +{ return v_cvtsat(a, b, n); } +inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b) +{ return v_cvtsat(a, b); } +inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n) +{ return v_cvtsat(a, b, n); } + +inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b, int n) +{ return v_storesat(ptr, b, n); } +inline void v_storesat_u8(uchar* ptr, const v_int16x8& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_u8(uchar* ptr, const v_int16x8& b, int n) +{ return v_storesat(ptr, b, n); } + +inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b) +{ return v_cvtsat(a, b); } +inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n) +{ return v_cvtsat(a, b, n); } +inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b) +{ return v_cvtsat(a, b); } +inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n) +{ return v_cvtsat(a, b, n); } + +inline void v_storesat_s8(schar* ptr, const v_uint16x8& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_s8(schar* ptr, const v_uint16x8& b, int n) +{ return v_storesat(ptr, b, n); } +inline void v_storesat_s8(schar* ptr, const v_int16x8& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_s8(schar* ptr, const v_int16x8& b, int n) +{ return v_storesat(ptr, b, n); } + +inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) +{ return v_cvtsat(a, b); } +inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n) +{ return v_cvtsat(a, b, n); } +inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b) +{ return v_cvtsat(a, b); } +inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) +{ return v_cvtsat(a, b, n); } + +inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b, int n) +{ return v_storesat(ptr, b, n); } +inline void v_storesat_u16(ushort* ptr, const v_int32x4& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_u16(ushort* ptr, const v_int32x4& b, int n) +{ return v_storesat(ptr, b, n); } + +inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b) +{ return v_cvtsat(a, b); } +inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n) +{ return v_cvtsat(a, b, n); } +inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b) +{ return v_cvtsat(a, b); } +inline v_int16x8 v_sat_s16(const v_int32x4& a, 
const v_int32x4& b, int n) +{ return v_cvtsat(a, b, n); } + +inline void v_storesat_s16(short* ptr, const v_uint32x4& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_s16(short* ptr, const v_uint32x4& b, int n) +{ return v_storesat(ptr, b, n); } +inline void v_storesat_s16(short* ptr, const v_int32x4& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_s16(short* ptr, const v_int32x4& b, int n) +{ return v_storesat(ptr, b, n); } + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], + v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], + v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], + v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); +} + +#endif + +}} + +#endif diff --git a/modules/hal/src/arithm.cpp b/modules/hal/src/arithm.cpp new file mode 100644 index 0000000000..a3f69facca --- /dev/null +++ b/modules/hal/src/arithm.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/color.cpp b/modules/hal/src/color.cpp new file mode 100644 index 0000000000..a3f69facca --- /dev/null +++ b/modules/hal/src/color.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/filter.cpp b/modules/hal/src/filter.cpp new file mode 100644 index 0000000000..a3f69facca --- /dev/null +++ b/modules/hal/src/filter.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/mathfuncs.cpp b/modules/hal/src/mathfuncs.cpp new file mode 100644 index 0000000000..a3f69facca --- /dev/null +++ b/modules/hal/src/mathfuncs.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/matrix.cpp b/modules/hal/src/matrix.cpp new file mode 100644 index 0000000000..a3f69facca --- /dev/null +++ b/modules/hal/src/matrix.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index 04ec378211..e6923fb897 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -1,2 +1,44 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ #include "opencv2/hal.hpp" +#include "opencv2/hal/intrin.hpp" diff --git a/modules/hal/src/resize.cpp b/modules/hal/src/resize.cpp new file mode 100644 index 0000000000..a3f69facca --- /dev/null +++ b/modules/hal/src/resize.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/norm.cpp b/modules/hal/src/stat.cpp similarity index 100% rename from modules/hal/src/norm.cpp rename to modules/hal/src/stat.cpp diff --git a/modules/hal/src/warp.cpp b/modules/hal/src/warp.cpp new file mode 100644 index 0000000000..a3f69facca --- /dev/null +++ b/modules/hal/src/warp.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} From ee11a2d266343e4af94875543503ad39ea2d2f4e Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 16 Apr 2015 23:00:26 +0300 Subject: [PATCH 02/12] fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions --- modules/calib3d/test/test_fisheye.cpp | 2 +- modules/core/include/opencv2/core/base.hpp | 47 +- modules/core/include/opencv2/core/matx.hpp | 2 +- .../core/include/opencv2/core/operations.hpp | 4 +- modules/core/src/kmeans.cpp | 8 +- modules/core/src/lapack.cpp | 182 +- modules/core/src/mathfuncs.cpp | 1489 +---------------- modules/core/src/stat.cpp | 166 +- modules/features2d/src/kaze/AKAZEFeatures.cpp | 2 +- modules/hal/include/opencv2/hal.hpp | 25 +- modules/hal/include/opencv2/hal/defs.h | 4 +- modules/hal/include/opencv2/hal/intrin.hpp | 901 ++++++++-- modules/hal/src/mathfuncs.cpp | 1305 +++++++++++++++ modules/hal/src/matrix.cpp | 161 ++ modules/hal/src/precomp.hpp | 4 + modules/hal/src/stat.cpp | 154 +- modules/photo/src/arrays.hpp | 5 + modules/stitching/src/autocalib.cpp | 2 +- 18 files changed, 2460 insertions(+), 2003 deletions(-) diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp index 553b81c39b..d4212e94fc 100644 --- a/modules/calib3d/test/test_fisheye.cpp +++ b/modules/calib3d/test/test_fisheye.cpp @@ -381,7 +381,7 @@ TEST_F(fisheyeTest, EtimateUncertainties) EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.890439368129246, 0.816096854937896), 1e-10); EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.00516248605191506, 0.0168181467500934, 0.0213118690274604, 0.00916010877545648), 1e-10); EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10); - CV_Assert(abs(rms - 0.263782587133546) < 1e-10); + CV_Assert(fabs(rms - 0.263782587133546) < 1e-10); CV_Assert(errors.alpha == 0); } diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 83661a2fd6..e4efe0fb9b 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -53,6 +53,7 @@ #include "opencv2/core/cvdef.h" #include "opencv2/core/cvstd.hpp" +#include "opencv2/hal.hpp" namespace cv { @@ -419,6 +420,12 @@ typedef Hamming HammingLUT; /////////////////////////////////// inline norms //////////////////////////////////// +template inline _Tp cv_abs(_Tp x) { return std::abs(x); } +inline int cv_abs(uchar x) { return x; } +inline int cv_abs(schar x) { return std::abs(x); } +inline int cv_abs(ushort x) { return x; } +inline int cv_abs(short x) { return std::abs(x); } + template static inline _AccTp normL2Sqr(const _Tp* a, int n) { @@ -447,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n) #if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4 ) { - s += (_AccTp)std::abs(a[i]) + (_AccTp)std::abs(a[i+1]) + - (_AccTp)std::abs(a[i+2]) + (_AccTp)std::abs(a[i+3]); + s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) + + (_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]); } #endif for( ; i < n; i++ ) - s += std::abs(a[i]); + s += cv_abs(a[i]); return s; } @@ -461,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n) { _AccTp s = 0; for( int i = 0; i < n; i++ ) - s = std::max(s, (_AccTp)std::abs(a[i])); + s = std::max(s, (_AccTp)cv_abs(a[i])); return s; } @@ -485,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n) return s; } -template<> inline -float normL2Sqr(const float* a, const float* b, int n) +inline float normL2Sqr(const float* a, const float* b, int n) { if( n >= 8 ) 
- return normL2Sqr_(a, b, n); + return hal::normL2Sqr_(a, b, n); float s = 0; for( int i = 0; i < n; i++ ) { @@ -519,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n) return s; } -template<> inline -float normL1(const float* a, const float* b, int n) +inline float normL1(const float* a, const float* b, int n) { if( n >= 8 ) - return normL1_(a, b, n); + return hal::normL1_(a, b, n); float s = 0; for( int i = 0; i < n; i++ ) { @@ -533,10 +538,9 @@ float normL1(const float* a, const float* b, int n) return s; } -template<> inline -int normL1(const uchar* a, const uchar* b, int n) +inline int normL1(const uchar* a, const uchar* b, int n) { - return normL1_(a, b, n); + return hal::normL1_(a, b, n); } template static inline @@ -551,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n) return s; } +/** @brief Computes the cube root of an argument. + + The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly. + NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for + single-precision data. + @param val A function argument. + */ +CV_EXPORTS_W float cubeRoot(float val); + +/** @brief Calculates the angle of a 2D vector in degrees. + + The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured + in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees. + @param x x-coordinate of the vector. + @param y y-coordinate of the vector. + */ +CV_EXPORTS_W float fastAtan2(float y, float x); ////////////////// forward declarations for important OpenCV types ////////////////// diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp index 6cc5d06251..e9023243e8 100644 --- a/modules/core/include/opencv2/core/matx.hpp +++ b/modules/core/include/opencv2/core/matx.hpp @@ -427,7 +427,7 @@ template struct Matx_DetOp double operator ()(const Matx<_Tp, m, m>& a) const { Matx<_Tp, m, m> temp = a; - double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0); + double p = hal::LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0); if( p == 0 ) return p; for( int i = 0; i < m; i++ ) diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp index bced1a7559..2c42e1f3a3 100644 --- a/modules/core/include/opencv2/core/operations.hpp +++ b/modules/core/include/opencv2/core/operations.hpp @@ -72,9 +72,9 @@ template struct Matx_FastInvOp b(i, i) = (_Tp)1; if( method == DECOMP_CHOLESKY ) - return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m); + return hal::Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m); - return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0; + return hal::LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0; } }; diff --git a/modules/core/src/kmeans.cpp b/modules/core/src/kmeans.cpp index cc86d2972d..fe5a0cf6e9 100644 --- a/modules/core/src/kmeans.cpp +++ b/modules/core/src/kmeans.cpp @@ -79,7 +79,7 @@ public: for ( int i = begin; i(k); - const double dist = normL2Sqr_(sample, center, dims); + const double dist = normL2Sqr(sample, center, dims); if( min_dist > dist ) { @@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K, if( labels[i] != max_k ) continue; sample = data.ptr(i); - double dist = normL2Sqr_(sample, _old_center, dims); + double dist = normL2Sqr(sample, _old_center, dims); if( max_dist <= dist ) { diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index a766e5f2ed..dea25dd64c 100644 
--- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -50,168 +50,6 @@ namespace cv { -/****************************************************************************************\ -* LU & Cholesky implementation for small matrices * -\****************************************************************************************/ - -template static inline int -LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) -{ - int i, j, k, p = 1; - astep /= sizeof(A[0]); - bstep /= sizeof(b[0]); - - for( i = 0; i < m; i++ ) - { - k = i; - - for( j = i+1; j < m; j++ ) - if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) ) - k = j; - - if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() ) - return 0; - - if( k != i ) - { - for( j = i; j < m; j++ ) - std::swap(A[i*astep + j], A[k*astep + j]); - if( b ) - for( j = 0; j < n; j++ ) - std::swap(b[i*bstep + j], b[k*bstep + j]); - p = -p; - } - - _Tp d = -1/A[i*astep + i]; - - for( j = i+1; j < m; j++ ) - { - _Tp alpha = A[j*astep + i]*d; - - for( k = i+1; k < m; k++ ) - A[j*astep + k] += alpha*A[i*astep + k]; - - if( b ) - for( k = 0; k < n; k++ ) - b[j*bstep + k] += alpha*b[i*bstep + k]; - } - - A[i*astep + i] = -d; - } - - if( b ) - { - for( i = m-1; i >= 0; i-- ) - for( j = 0; j < n; j++ ) - { - _Tp s = b[i*bstep + j]; - for( k = i+1; k < m; k++ ) - s -= A[i*astep + k]*b[k*bstep + j]; - b[i*bstep + j] = s*A[i*astep + i]; - } - } - - return p; -} - - -int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n) -{ - return LUImpl(A, astep, m, b, bstep, n); -} - - -int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n) -{ - return LUImpl(A, astep, m, b, bstep, n); -} - - -template static inline bool -CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) -{ - _Tp* L = A; - int i, j, k; - double s; - astep /= sizeof(A[0]); - bstep /= sizeof(b[0]); - - for( i = 0; i < m; i++ ) - { - for( j = 0; j < i; j++ ) - { - s = A[i*astep + j]; - for( k = 0; k < j; k++ ) - s -= L[i*astep + k]*L[j*astep + k]; - L[i*astep + j] = (_Tp)(s*L[j*astep + j]); - } - s = A[i*astep + i]; - for( k = 0; k < j; k++ ) - { - double t = L[i*astep + k]; - s -= t*t; - } - if( s < std::numeric_limits<_Tp>::epsilon() ) - return false; - L[i*astep + i] = (_Tp)(1./std::sqrt(s)); - } - - if( !b ) - return true; - - // LLt x = b - // 1: L y = b - // 2. 
Lt x = y - - /* - [ L00 ] y0 b0 - [ L10 L11 ] y1 = b1 - [ L20 L21 L22 ] y2 b2 - [ L30 L31 L32 L33 ] y3 b3 - - [ L00 L10 L20 L30 ] x0 y0 - [ L11 L21 L31 ] x1 = y1 - [ L22 L32 ] x2 y2 - [ L33 ] x3 y3 - */ - - for( i = 0; i < m; i++ ) - { - for( j = 0; j < n; j++ ) - { - s = b[i*bstep + j]; - for( k = 0; k < i; k++ ) - s -= L[i*astep + k]*b[k*bstep + j]; - b[i*bstep + j] = (_Tp)(s*L[i*astep + i]); - } - } - - for( i = m-1; i >= 0; i-- ) - { - for( j = 0; j < n; j++ ) - { - s = b[i*bstep + j]; - for( k = m-1; k > i; k-- ) - s -= L[k*astep + i]*b[k*bstep + j]; - b[i*bstep + j] = (_Tp)(s*L[i*astep + i]); - } - } - - return true; -} - - -bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n) -{ - return CholImpl(A, astep, m, b, bstep, n); -} - -bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n) -{ - return CholImpl(A, astep, m, b, bstep, n); -} - - template static inline _Tp hypot(_Tp a, _Tp b) { a = std::abs(a); @@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat ) Mat a(rows, rows, CV_32F, (uchar*)buffer); mat.copyTo(a); - result = LU(a.ptr(), a.step, rows, 0, 0, 0); + result = hal::LU(a.ptr(), a.step, rows, 0, 0, 0); if( result ) { for( int i = 0; i < rows; i++ ) @@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat ) Mat a(rows, rows, CV_64F, (uchar*)buffer); mat.copyTo(a); - result = LU(a.ptr(), a.step, rows, 0, 0, 0); + result = hal::LU(a.ptr(), a.step, rows, 0, 0, 0); if( result ) { for( int i = 0; i < rows; i++ ) @@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) setIdentity(dst); if( method == DECOMP_LU && type == CV_32F ) - result = LU(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n) != 0; + result = hal::LU(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n) != 0; else if( method == DECOMP_LU && type == CV_64F ) - result = LU(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n) != 0; + result = hal::LU(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n) != 0; else if( method == DECOMP_CHOLESKY && type == CV_32F ) - result = Cholesky(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n); + result = hal::Cholesky(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n); else - result = Cholesky(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n); + result = hal::Cholesky(src1.ptr(), src1.step, n, dst.ptr(), dst.step, n); if( !result ) dst = Scalar(0); @@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth if( method == DECOMP_LU ) { if( type == CV_32F ) - result = LU(a.ptr(), a.step, n, dst.ptr(), dst.step, nb) != 0; + result = hal::LU(a.ptr(), a.step, n, dst.ptr(), dst.step, nb) != 0; else - result = LU(a.ptr(), a.step, n, dst.ptr(), dst.step, nb) != 0; + result = hal::LU(a.ptr(), a.step, n, dst.ptr(), dst.step, nb) != 0; } else if( method == DECOMP_CHOLESKY ) { if( type == CV_32F ) - result = Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); + result = hal::Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); else - result = Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); + result = hal::Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); } else { diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 446b62731b..e96eaeb41a 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -121,107 +121,6 @@ float fastAtan2( float y, float x ) return a; } -static void FastAtan2_32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees=true ) -{ - int i = 0; - float scale = 
angleInDegrees ? 1 : (float)(CV_PI/180); - -#ifdef HAVE_TEGRA_OPTIMIZATION - if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale)) - return; -#endif - -#if CV_SSE2 - if( USE_SSE2 ) - { - Cv32suf iabsmask; iabsmask.i = 0x7fffffff; - __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f); - __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f); - __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale); - __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3); - __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7); - - for( ; i <= len - 4; i += 4 ) - { - __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i); - __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask); - __m128 mask = _mm_cmplt_ps(ax, ay); - __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay); - __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps)); - __m128 c2 = _mm_mul_ps(c, c); - __m128 a = _mm_mul_ps(c2, p7); - a = _mm_mul_ps(_mm_add_ps(a, p5), c2); - a = _mm_mul_ps(_mm_add_ps(a, p3), c2); - a = _mm_mul_ps(_mm_add_ps(a, p1), c); - - __m128 b = _mm_sub_ps(_90, a); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - b = _mm_sub_ps(_180, a); - mask = _mm_cmplt_ps(x, z); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - b = _mm_sub_ps(_360, a); - mask = _mm_cmplt_ps(y, z); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - a = _mm_mul_ps(a, scale4); - _mm_storeu_ps(angle + i, a); - } - } -#elif CV_NEON - float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON); - float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f); - float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale); - float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3); - float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7); - - for( ; i <= len - 4; i += 4 ) - { - float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i); - float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y); - float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); - float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps))); - float32x4_t c2 = vmulq_f32(c, c); - float32x4_t a = vmulq_f32(c2, p7); - a = vmulq_f32(vaddq_f32(a, p5), c2); - a = vmulq_f32(vaddq_f32(a, p3), c2); - a = vmulq_f32(vaddq_f32(a, p1), c); - - a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); - a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a); - a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a); - - vst1q_f32(angle + i, vmulq_f32(a, scale4)); - } -#endif - - for( ; i < len; i++ ) - { - float x = X[i], y = Y[i]; - float ax = std::abs(x), ay = std::abs(y); - float a, c, c2; - if( ax >= ay ) - { - c = ay/(ax + (float)DBL_EPSILON); - c2 = c*c; - a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - else - { - c = ax/(ay + (float)DBL_EPSILON); - c2 = c*c; - a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - if( x < 0 ) - a = 180.f - a; - if( y < 0 ) - a = 360.f - a; - angle[i] = (float)(a*scale); - } -} - - /* ************************************************************************** *\ Fast cube root by Ken Turkowski (http://www.worldserver.com/turk/computergraphics/papers.html) @@ -263,255 +162,6 @@ float cubeRoot( float value ) return v.f; } -static void Magnitude_32f(const float* x, const float* y, float* mag, int len) -{ -#if defined HAVE_IPP && 0 - CV_IPP_CHECK() - { - IppStatus status = ippsMagnitude_32f(x, y, mag, len); - if (status >= 0) - { - 
CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE - if( USE_SSE2 ) - { - for( ; i <= len - 8; i += 8 ) - { - __m128 x0 = _mm_loadu_ps(x + i), x1 = _mm_loadu_ps(x + i + 4); - __m128 y0 = _mm_loadu_ps(y + i), y1 = _mm_loadu_ps(y + i + 4); - x0 = _mm_add_ps(_mm_mul_ps(x0, x0), _mm_mul_ps(y0, y0)); - x1 = _mm_add_ps(_mm_mul_ps(x1, x1), _mm_mul_ps(y1, y1)); - x0 = _mm_sqrt_ps(x0); x1 = _mm_sqrt_ps(x1); - _mm_storeu_ps(mag + i, x0); _mm_storeu_ps(mag + i + 4, x1); - } - } -#elif CV_NEON - for( ; i <= len - 4; i += 4 ) - { - float32x4_t v_x = vld1q_f32(x + i), v_y = vld1q_f32(y + i); - vst1q_f32(mag + i, cv_vsqrtq_f32(vmlaq_f32(vmulq_f32(v_x, v_x), v_y, v_y))); - } - for( ; i <= len - 2; i += 2 ) - { - float32x2_t v_x = vld1_f32(x + i), v_y = vld1_f32(y + i); - vst1_f32(mag + i, cv_vsqrt_f32(vmla_f32(vmul_f32(v_x, v_x), v_y, v_y))); - } -#endif - - for( ; i < len; i++ ) - { - float x0 = x[i], y0 = y[i]; - mag[i] = std::sqrt(x0*x0 + y0*y0); - } -} - -static void Magnitude_64f(const double* x, const double* y, double* mag, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - IppStatus status = ippsMagnitude_64f(x, y, mag, len); - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE2 - if( USE_SSE2 ) - { - for( ; i <= len - 4; i += 4 ) - { - __m128d x0 = _mm_loadu_pd(x + i), x1 = _mm_loadu_pd(x + i + 2); - __m128d y0 = _mm_loadu_pd(y + i), y1 = _mm_loadu_pd(y + i + 2); - x0 = _mm_add_pd(_mm_mul_pd(x0, x0), _mm_mul_pd(y0, y0)); - x1 = _mm_add_pd(_mm_mul_pd(x1, x1), _mm_mul_pd(y1, y1)); - x0 = _mm_sqrt_pd(x0); x1 = _mm_sqrt_pd(x1); - _mm_storeu_pd(mag + i, x0); _mm_storeu_pd(mag + i + 2, x1); - } - } -#endif - - for( ; i < len; i++ ) - { - double x0 = x[i], y0 = y[i]; - mag[i] = std::sqrt(x0*x0 + y0*y0); - } -} - - -static void InvSqrt_32f(const float* src, float* dst, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (ippsInvSqrt_32f_A21(src, dst, len) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE - if( USE_SSE2 ) - { - __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); - if( (((size_t)src|(size_t)dst) & 15) == 0 ) - for( ; i <= len - 8; i += 8 ) - { - __m128 t0 = _mm_load_ps(src + i), t1 = _mm_load_ps(src + i + 4); - __m128 h0 = _mm_mul_ps(t0, _0_5), h1 = _mm_mul_ps(t1, _0_5); - t0 = _mm_rsqrt_ps(t0); t1 = _mm_rsqrt_ps(t1); - t0 = _mm_mul_ps(t0, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t0,t0),h0))); - t1 = _mm_mul_ps(t1, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t1,t1),h1))); - _mm_store_ps(dst + i, t0); _mm_store_ps(dst + i + 4, t1); - } - else - for( ; i <= len - 8; i += 8 ) - { - __m128 t0 = _mm_loadu_ps(src + i), t1 = _mm_loadu_ps(src + i + 4); - __m128 h0 = _mm_mul_ps(t0, _0_5), h1 = _mm_mul_ps(t1, _0_5); - t0 = _mm_rsqrt_ps(t0); t1 = _mm_rsqrt_ps(t1); - t0 = _mm_mul_ps(t0, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t0,t0),h0))); - t1 = _mm_mul_ps(t1, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t1,t1),h1))); - _mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1); - } - } -#elif CV_NEON - for ( ; i <= len - 8; i += 8) - { - vst1q_f32(dst + i, cv_vrsqrtq_f32(vld1q_f32(src + i))); - vst1q_f32(dst + i + 4, cv_vrsqrtq_f32(vld1q_f32(src + i + 4))); - } -#endif - - for( ; i < len; i++ ) - dst[i] = 1/std::sqrt(src[i]); -} - - -static void InvSqrt_64f(const double* src, double* dst, int len) -{ - int i = 0; - -#if CV_SSE2 - if (USE_SSE2) - { - __m128d v_1 = _mm_set1_pd(1.0); - for ( ; i <= len 
- 2; i += 2) - _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); - } -#endif - - for( ; i < len; i++ ) - dst[i] = 1/std::sqrt(src[i]); -} - - -static void Sqrt_32f(const float* src, float* dst, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (ippsSqrt_32f_A21(src, dst, len) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - int i = 0; - -#if CV_SSE - if( USE_SSE2 ) - { - if( (((size_t)src|(size_t)dst) & 15) == 0 ) - for( ; i <= len - 8; i += 8 ) - { - __m128 t0 = _mm_load_ps(src + i), t1 = _mm_load_ps(src + i + 4); - t0 = _mm_sqrt_ps(t0); t1 = _mm_sqrt_ps(t1); - _mm_store_ps(dst + i, t0); _mm_store_ps(dst + i + 4, t1); - } - else - for( ; i <= len - 8; i += 8 ) - { - __m128 t0 = _mm_loadu_ps(src + i), t1 = _mm_loadu_ps(src + i + 4); - t0 = _mm_sqrt_ps(t0); t1 = _mm_sqrt_ps(t1); - _mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1); - } - } -#elif CV_NEON - for ( ; i <= len - 8; i += 8) - { - vst1q_f32(dst + i, cv_vsqrtq_f32(vld1q_f32(src + i))); - vst1q_f32(dst + i + 4, cv_vsqrtq_f32(vld1q_f32(src + i + 4))); - } -#endif - - for( ; i < len; i++ ) - dst[i] = std::sqrt(src[i]); -} - - -static void Sqrt_64f(const double* src, double* dst, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (ippsSqrt_64f_A50(src, dst, len) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src|(size_t)dst) & 15) == 0 ) - for( ; i <= len - 4; i += 4 ) - { - __m128d t0 = _mm_load_pd(src + i), t1 = _mm_load_pd(src + i + 2); - t0 = _mm_sqrt_pd(t0); t1 = _mm_sqrt_pd(t1); - _mm_store_pd(dst + i, t0); _mm_store_pd(dst + i + 2, t1); - } - else - for( ; i <= len - 4; i += 4 ) - { - __m128d t0 = _mm_loadu_pd(src + i), t1 = _mm_loadu_pd(src + i + 2); - t0 = _mm_sqrt_pd(t0); t1 = _mm_sqrt_pd(t1); - _mm_storeu_pd(dst + i, t0); _mm_storeu_pd(dst + i + 2, t1); - } - } -#endif - - for( ; i < len; i++ ) - dst[i] = std::sqrt(src[i]); -} - - /****************************************************************************************\ * Cartezian -> Polar * \****************************************************************************************/ @@ -539,13 +189,13 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst ) { const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *mag = (float*)ptrs[2]; - Magnitude_32f( x, y, mag, len ); + hal::magnitude( x, y, mag, len ); } else { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *mag = (double*)ptrs[2]; - Magnitude_64f( x, y, mag, len ); + hal::magnitude( x, y, mag, len ); } } } @@ -588,7 +238,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre { const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *angle = (float*)ptrs[2]; - FastAtan2_32f( y, x, angle, len, angleInDegrees ); + hal::fastAtan2( y, x, angle, len, angleInDegrees ); } else { @@ -618,7 +268,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre buf[1][k] = (float)y[k]; } - FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); + hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees ); k = 0; #if CV_SSE2 @@ -722,15 +372,15 @@ void cartToPolar( InputArray src1, InputArray src2, { const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3]; - Magnitude_32f( x, y, mag, len ); - FastAtan2_32f( y, x, angle, len, 
angleInDegrees ); + hal::magnitude( x, y, mag, len ); + hal::fastAtan2( y, x, angle, len, angleInDegrees ); } else { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *angle = (double*)ptrs[3]; - Magnitude_64f(x, y, (double*)ptrs[2], len); + hal::magnitude(x, y, (double*)ptrs[2], len); k = 0; #if CV_SSE2 @@ -755,7 +405,7 @@ void cartToPolar( InputArray src1, InputArray src2, buf[1][k] = (float)y[k]; } - FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); + hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees ); k = 0; #if CV_SSE2 @@ -1096,482 +746,6 @@ void polarToCart( InputArray src1, InputArray src2, * E X P * \****************************************************************************************/ -typedef union -{ - struct { -#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) - int hi; - int lo; -#else - int lo; - int hi; -#endif - } i; - double d; -} -DBLINT; - -#define EXPTAB_SCALE 6 -#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) - -#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2 - -static const double expTab[] = { - 1.0 * EXPPOLY_32F_A0, - 1.0108892860517004600204097905619 * EXPPOLY_32F_A0, - 1.0218971486541166782344801347833 * EXPPOLY_32F_A0, - 1.0330248790212284225001082839705 * EXPPOLY_32F_A0, - 1.0442737824274138403219664787399 * EXPPOLY_32F_A0, - 1.0556451783605571588083413251529 * EXPPOLY_32F_A0, - 1.0671404006768236181695211209928 * EXPPOLY_32F_A0, - 1.0787607977571197937406800374385 * EXPPOLY_32F_A0, - 1.0905077326652576592070106557607 * EXPPOLY_32F_A0, - 1.1023825833078409435564142094256 * EXPPOLY_32F_A0, - 1.1143867425958925363088129569196 * EXPPOLY_32F_A0, - 1.126521618608241899794798643787 * EXPPOLY_32F_A0, - 1.1387886347566916537038302838415 * EXPPOLY_32F_A0, - 1.151189229952982705817759635202 * EXPPOLY_32F_A0, - 1.1637248587775775138135735990922 * EXPPOLY_32F_A0, - 1.1763969916502812762846457284838 * EXPPOLY_32F_A0, - 1.1892071150027210667174999705605 * EXPPOLY_32F_A0, - 1.2021567314527031420963969574978 * EXPPOLY_32F_A0, - 1.2152473599804688781165202513388 * EXPPOLY_32F_A0, - 1.2284805361068700056940089577928 * EXPPOLY_32F_A0, - 1.2418578120734840485936774687266 * EXPPOLY_32F_A0, - 1.2553807570246910895793906574423 * EXPPOLY_32F_A0, - 1.2690509571917332225544190810323 * EXPPOLY_32F_A0, - 1.2828700160787782807266697810215 * EXPPOLY_32F_A0, - 1.2968395546510096659337541177925 * EXPPOLY_32F_A0, - 1.3109612115247643419229917863308 * EXPPOLY_32F_A0, - 1.3252366431597412946295370954987 * EXPPOLY_32F_A0, - 1.3396675240533030053600306697244 * EXPPOLY_32F_A0, - 1.3542555469368927282980147401407 * EXPPOLY_32F_A0, - 1.3690024229745906119296011329822 * EXPPOLY_32F_A0, - 1.3839098819638319548726595272652 * EXPPOLY_32F_A0, - 1.3989796725383111402095281367152 * EXPPOLY_32F_A0, - 1.4142135623730950488016887242097 * EXPPOLY_32F_A0, - 1.4296133383919700112350657782751 * EXPPOLY_32F_A0, - 1.4451808069770466200370062414717 * EXPPOLY_32F_A0, - 1.4609177941806469886513028903106 * EXPPOLY_32F_A0, - 1.476826145939499311386907480374 * EXPPOLY_32F_A0, - 1.4929077282912648492006435314867 * EXPPOLY_32F_A0, - 1.5091644275934227397660195510332 * EXPPOLY_32F_A0, - 1.5255981507445383068512536895169 * EXPPOLY_32F_A0, - 1.5422108254079408236122918620907 * EXPPOLY_32F_A0, - 1.5590044002378369670337280894749 * EXPPOLY_32F_A0, - 1.5759808451078864864552701601819 * EXPPOLY_32F_A0, - 1.5931421513422668979372486431191 * EXPPOLY_32F_A0, - 1.6104903319492543081795206673574 * EXPPOLY_32F_A0, - 
1.628027421857347766848218522014 * EXPPOLY_32F_A0, - 1.6457554781539648445187567247258 * EXPPOLY_32F_A0, - 1.6636765803267364350463364569764 * EXPPOLY_32F_A0, - 1.6817928305074290860622509524664 * EXPPOLY_32F_A0, - 1.7001063537185234695013625734975 * EXPPOLY_32F_A0, - 1.7186192981224779156293443764563 * EXPPOLY_32F_A0, - 1.7373338352737062489942020818722 * EXPPOLY_32F_A0, - 1.7562521603732994831121606193753 * EXPPOLY_32F_A0, - 1.7753764925265212525505592001993 * EXPPOLY_32F_A0, - 1.7947090750031071864277032421278 * EXPPOLY_32F_A0, - 1.8142521755003987562498346003623 * EXPPOLY_32F_A0, - 1.8340080864093424634870831895883 * EXPPOLY_32F_A0, - 1.8539791250833855683924530703377 * EXPPOLY_32F_A0, - 1.8741676341102999013299989499544 * EXPPOLY_32F_A0, - 1.8945759815869656413402186534269 * EXPPOLY_32F_A0, - 1.9152065613971472938726112702958 * EXPPOLY_32F_A0, - 1.9360617934922944505980559045667 * EXPPOLY_32F_A0, - 1.9571441241754002690183222516269 * EXPPOLY_32F_A0, - 1.9784560263879509682582499181312 * EXPPOLY_32F_A0, -}; - - -// the code below uses _mm_cast* intrinsics, which are not avialable on VS2005 -#if (defined _MSC_VER && _MSC_VER < 1500) || \ - (!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402) -#undef CV_SSE2 -#define CV_SSE2 0 -#endif - -static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE); -static const double exp_postscale = 1./(1 << EXPTAB_SCALE); -static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000 - -static void Exp_32f( const float *_x, float *y, int n ) -{ - static const float - A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), - A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), - A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), - A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); - -#undef EXPPOLY -#define EXPPOLY(x) \ - (((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4) - - int i = 0; - const Cv32suf* x = (const Cv32suf*)_x; - Cv32suf buf[4]; - -#if CV_SSE2 - if( n >= 8 && USE_SSE2 ) - { - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); - static const __m128 mA3 = _mm_set1_ps(A3); - static const __m128 mA4 = _mm_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 16 == 0; - - ushort CV_DECL_ALIGNED(16) tab_idx[8]; - - for( ; i <= n - 8; i += 8 ) - { - __m128 xf0, xf1; - xf0 = _mm_loadu_ps(&x[i].f); - xf1 = _mm_loadu_ps(&x[i+4].f); - __m128i xi0, xi1, xi2, xi3; - - xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4); - xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4); - - __m128d xd0 = _mm_cvtps_pd(xf0); - __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0)); - __m128d xd1 = _mm_cvtps_pd(xf1); - __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1)); - - xd0 = _mm_mul_pd(xd0, prescale2); - xd2 = _mm_mul_pd(xd2, prescale2); - xd1 = _mm_mul_pd(xd1, prescale2); - xd3 = _mm_mul_pd(xd3, prescale2); - - xi0 = _mm_cvtpd_epi32(xd0); - xi2 = _mm_cvtpd_epi32(xd2); - - xi1 = _mm_cvtpd_epi32(xd1); - xi3 = _mm_cvtpd_epi32(xd3); - - xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0)); - xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2)); - xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1)); - xd3 = 
_mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3)); - - xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2)); - xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3)); - - xf0 = _mm_mul_ps(xf0, postscale4); - xf1 = _mm_mul_ps(xf1, postscale4); - - xi0 = _mm_unpacklo_epi64(xi0, xi2); - xi1 = _mm_unpacklo_epi64(xi1, xi3); - xi0 = _mm_packs_epi32(xi0, xi1); - - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); - - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - - __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); - __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); - - yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); - yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); - - __m128 zf0 = _mm_add_ps(xf0, mA1); - __m128 zf1 = _mm_add_ps(xf1, mA1); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); - - zf0 = _mm_mul_ps(zf0, yf0); - zf1 = _mm_mul_ps(zf1, yf1); - - if( y_aligned ) - { - _mm_store_ps(y + i, zf0); - _mm_store_ps(y + i + 4, zf1); - } - else - { - _mm_storeu_ps(y + i, zf0); - _mm_storeu_ps(y + i + 4, zf1); - } - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; - int val0, val1, val2, val3, t; - - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) - x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) - x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) - x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); - - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; - - t = (val0 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[0].i = t << 23; - - t = (val1 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[1].i = t << 23; - - t = (val2 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[2].i = t << 23; - - t = (val3 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 
0 : 255; - buf[3].i = t << 23; - - x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); - - y[i] = (float)x0; - y[i + 1] = (float)x1; - - x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); - - y[i + 2] = (float)x2; - y[i + 3] = (float)x3; - } - - for( ; i < n; i++ ) - { - double x0 = x[i].f * exp_prescale; - int val0, t; - - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - - buf[0].i = t << 23; - x0 = (x0 - val0)*exp_postscale; - - y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); - } -} - - -static void Exp_64f( const double *_x, double *y, int n ) -{ - static const double - A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, - A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, - A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0, - A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0, - A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, - A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; - -#undef EXPPOLY -#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) - - int i = 0; - Cv64suf buf[4]; - const Cv64suf* x = (const Cv64suf*)_x; - -#if CV_SSE2 - if( USE_SSE2 ) - { - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128d postscale2 = _mm_set1_pd(exp_postscale); - static const __m128d maxval2 = _mm_set1_pd(exp_max_val); - static const __m128d minval2 = _mm_set1_pd(-exp_max_val); - - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - - int CV_DECL_ALIGNED(16) tab_idx[4]; - - for( ; i <= n - 4; i += 4 ) - { - __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); - __m128i xi0, xi1; - xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); - xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); - xf0 = _mm_mul_pd(xf0, prescale2); - xf1 = _mm_mul_pd(xf1, prescale2); - - xi0 = _mm_cvtpd_epi32(xf0); - xi1 = _mm_cvtpd_epi32(xf1); - xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); - xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); - - xi0 = _mm_unpacklo_epi64(xi0, xi1); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); - - xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); - xi0 = _mm_packs_epi32(xi0, xi0); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); - - __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); - yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); - - __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); - __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); - zf1 = 
_mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); - - zf0 = _mm_mul_pd(zf0, yf0); - zf1 = _mm_mul_pd(zf1, yf1); - - _mm_storeu_pd(y + i, zf0); - _mm_storeu_pd(y + i + 2, zf1); - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; - - double y0, y1, y2, y3; - int val0, val1, val2, val3, t; - - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; - - t = (int)(x[i+1].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x1 = t < 0 ? -exp_max_val : exp_max_val; - - t = (int)(x[i+2].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x2 = t < 0 ? -exp_max_val : exp_max_val; - - t = (int)(x[i+3].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x3 = t < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); - - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; - - t = (val0 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[0].i = (int64)t << 52; - - t = (val1 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[1].i = (int64)t << 52; - - t = (val2 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[2].i = (int64)t << 52; - - t = (val3 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[3].i = (int64)t << 52; - - y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); - - y[i] = y0; - y[i + 1] = y1; - - y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); - - y[i + 2] = y2; - y[i + 3] = y3; - } - - for( ; i < n; i++ ) - { - double x0 = x[i].f * exp_prescale; - int val0, t; - - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 
0 : 2047; - - buf[0].i = (int64)t << 52; - x0 = (x0 - val0)*exp_postscale; - - y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - } -} - -#undef EXPTAB_SCALE -#undef EXPTAB_MASK -#undef EXPPOLY_32F_A0 - #ifdef HAVE_IPP static void Exp_32f_ipp(const float *x, float *y, int n) { @@ -1584,7 +758,7 @@ static void Exp_32f_ipp(const float *x, float *y, int n) } setIppErrorStatus(); } - Exp_32f(x, y, n); + hal::exp(x, y, n); } static void Exp_64f_ipp(const double *x, double *y, int n) @@ -1598,11 +772,14 @@ static void Exp_64f_ipp(const double *x, double *y, int n) } setIppErrorStatus(); } - Exp_64f(x, y, n); + hal::exp(x, y, n); } #define Exp_32f Exp_32f_ipp #define Exp_64f Exp_64f_ipp +#else +#define Exp_32f hal::exp +#define Exp_64f hal::exp #endif @@ -1637,613 +814,6 @@ void exp( InputArray _src, OutputArray _dst ) * L O G * \****************************************************************************************/ -#define LOGTAB_SCALE 8 -#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) -#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) -#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) - -static const double CV_DECL_ALIGNED(16) icvLogTab[] = { -0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, -.00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, -.00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, -.01165061721997527263705585198749759001657, .9884169884169884169884169884169884169884, -.01550418653596525274396267235488267033361, .9846153846153846153846153846153846153846, -.01934296284313093139406447562578250654042, .9808429118773946360153256704980842911877, -.02316705928153437593630670221500622574241, .9770992366412213740458015267175572519084, -.02697658769820207233514075539915211265906, .9733840304182509505703422053231939163498, -.03077165866675368732785500469617545604706, .9696969696969696969696969696969696969697, -.03455238150665972812758397481047722976656, .9660377358490566037735849056603773584906, -.03831886430213659461285757856785494368522, .9624060150375939849624060150375939849624, -.04207121392068705056921373852674150839447, .9588014981273408239700374531835205992509, -.04580953603129420126371940114040626212953, .9552238805970149253731343283582089552239, -.04953393512227662748292900118940451648088, .9516728624535315985130111524163568773234, -.05324451451881227759255210685296333394944, .9481481481481481481481481481481481481481, -.05694137640013842427411105973078520037234, .9446494464944649446494464944649446494465, -.06062462181643483993820353816772694699466, .9411764705882352941176470588235294117647, -.06429435070539725460836422143984236754475, .9377289377289377289377289377289377289377, -.06795066190850773679699159401934593915938, .9343065693430656934306569343065693430657, -.07159365318700880442825962290953611955044, .9309090909090909090909090909090909090909, -.07522342123758751775142172846244648098944, .9275362318840579710144927536231884057971, -.07884006170777602129362549021607264876369, .9241877256317689530685920577617328519856, -.08244366921107458556772229485432035289706, .9208633093525179856115107913669064748201, -.08603433734180314373940490213499288074675, .9175627240143369175627240143369175627240, -.08961215868968712416897659522874164395031, .9142857142857142857142857142857142857143, -.09317722485418328259854092721070628613231, .9110320284697508896797153024911032028470, -.09672962645855109897752299730200320482256, 
.9078014184397163120567375886524822695035, -.10026945316367513738597949668474029749630, .9045936395759717314487632508833922261484, -.10379679368164355934833764649738441221420, .9014084507042253521126760563380281690141, -.10731173578908805021914218968959175981580, .8982456140350877192982456140350877192982, -.11081436634029011301105782649756292812530, .8951048951048951048951048951048951048951, -.11430477128005862852422325204315711744130, .8919860627177700348432055749128919860627, -.11778303565638344185817487641543266363440, .8888888888888888888888888888888888888889, -.12124924363286967987640707633545389398930, .8858131487889273356401384083044982698962, -.12470347850095722663787967121606925502420, .8827586206896551724137931034482758620690, -.12814582269193003360996385708858724683530, .8797250859106529209621993127147766323024, -.13157635778871926146571524895989568904040, .8767123287671232876712328767123287671233, -.13499516453750481925766280255629681050780, .8737201365187713310580204778156996587031, -.13840232285911913123754857224412262439730, .8707482993197278911564625850340136054422, -.14179791186025733629172407290752744302150, .8677966101694915254237288135593220338983, -.14518200984449788903951628071808954700830, .8648648648648648648648648648648648648649, -.14855469432313711530824207329715136438610, .8619528619528619528619528619528619528620, -.15191604202584196858794030049466527998450, .8590604026845637583892617449664429530201, -.15526612891112392955683674244937719777230, .8561872909698996655518394648829431438127, -.15860503017663857283636730244325008243330, .8533333333333333333333333333333333333333, -.16193282026931324346641360989451641216880, .8504983388704318936877076411960132890365, -.16524957289530714521497145597095368430010, .8476821192052980132450331125827814569536, -.16855536102980664403538924034364754334090, .8448844884488448844884488448844884488449, -.17185025692665920060697715143760433420540, .8421052631578947368421052631578947368421, -.17513433212784912385018287750426679849630, .8393442622950819672131147540983606557377, -.17840765747281828179637841458315961062910, .8366013071895424836601307189542483660131, -.18167030310763465639212199675966985523700, .8338762214983713355048859934853420195440, -.18492233849401198964024217730184318497780, .8311688311688311688311688311688311688312, -.18816383241818296356839823602058459073300, .8284789644012944983818770226537216828479, -.19139485299962943898322009772527962923050, .8258064516129032258064516129032258064516, -.19461546769967164038916962454095482826240, .8231511254019292604501607717041800643087, -.19782574332991986754137769821682013571260, .8205128205128205128205128205128205128205, -.20102574606059073203390141770796617493040, .8178913738019169329073482428115015974441, -.20421554142869088876999228432396193966280, .8152866242038216560509554140127388535032, -.20739519434607056602715147164417430758480, .8126984126984126984126984126984126984127, -.21056476910734961416338251183333341032260, .8101265822784810126582278481012658227848, -.21372432939771812687723695489694364368910, .8075709779179810725552050473186119873817, -.21687393830061435506806333251006435602900, .8050314465408805031446540880503144654088, -.22001365830528207823135744547471404075630, .8025078369905956112852664576802507836991, -.22314355131420973710199007200571941211830, .8000000000000000000000000000000000000000, -.22626367865045338145790765338460914790630, .7975077881619937694704049844236760124611, -.22937410106484582006380890106811420992010, 
.7950310559006211180124223602484472049689, -.23247487874309405442296849741978803649550, .7925696594427244582043343653250773993808, -.23556607131276688371634975283086532726890, .7901234567901234567901234567901234567901, -.23864773785017498464178231643018079921600, .7876923076923076923076923076923076923077, -.24171993688714515924331749374687206000090, .7852760736196319018404907975460122699387, -.24478272641769091566565919038112042471760, .7828746177370030581039755351681957186544, -.24783616390458124145723672882013488560910, .7804878048780487804878048780487804878049, -.25088030628580937353433455427875742316250, .7781155015197568389057750759878419452888, -.25391520998096339667426946107298135757450, .7757575757575757575757575757575757575758, -.25694093089750041913887912414793390780680, .7734138972809667673716012084592145015106, -.25995752443692604627401010475296061486000, .7710843373493975903614457831325301204819, -.26296504550088134477547896494797896593800, .7687687687687687687687687687687687687688, -.26596354849713793599974565040611196309330, .7664670658682634730538922155688622754491, -.26895308734550393836570947314612567424780, .7641791044776119402985074626865671641791, -.27193371548364175804834985683555714786050, .7619047619047619047619047619047619047619, -.27490548587279922676529508862586226314300, .7596439169139465875370919881305637982196, -.27786845100345625159121709657483734190480, .7573964497041420118343195266272189349112, -.28082266290088775395616949026589281857030, .7551622418879056047197640117994100294985, -.28376817313064456316240580235898960381750, .7529411764705882352941176470588235294118, -.28670503280395426282112225635501090437180, .7507331378299120234604105571847507331378, -.28963329258304265634293983566749375313530, .7485380116959064327485380116959064327485, -.29255300268637740579436012922087684273730, .7463556851311953352769679300291545189504, -.29546421289383584252163927885703742504130, .7441860465116279069767441860465116279070, -.29836697255179722709783618483925238251680, .7420289855072463768115942028985507246377, -.30126133057816173455023545102449133992200, .7398843930635838150289017341040462427746, -.30414733546729666446850615102448500692850, .7377521613832853025936599423631123919308, -.30702503529491181888388950937951449304830, .7356321839080459770114942528735632183908, -.30989447772286465854207904158101882785550, .7335243553008595988538681948424068767908, -.31275571000389684739317885942000430077330, .7314285714285714285714285714285714285714, -.31560877898630329552176476681779604405180, .7293447293447293447293447293447293447293, -.31845373111853458869546784626436419785030, .7272727272727272727272727272727272727273, -.32129061245373424782201254856772720813750, .7252124645892351274787535410764872521246, -.32411946865421192853773391107097268104550, .7231638418079096045197740112994350282486, -.32694034499585328257253991068864706903700, .7211267605633802816901408450704225352113, -.32975328637246797969240219572384376078850, .7191011235955056179775280898876404494382, -.33255833730007655635318997155991382896900, .7170868347338935574229691876750700280112, -.33535554192113781191153520921943709254280, .7150837988826815642458100558659217877095, -.33814494400871636381467055798566434532400, .7130919220055710306406685236768802228412, -.34092658697059319283795275623560883104800, .7111111111111111111111111111111111111111, -.34370051385331840121395430287520866841080, .7091412742382271468144044321329639889197, -.34646676734620857063262633346312213689100, 
.7071823204419889502762430939226519337017, -.34922538978528827602332285096053965389730, .7052341597796143250688705234159779614325, -.35197642315717814209818925519357435405250, .7032967032967032967032967032967032967033, -.35471990910292899856770532096561510115850, .7013698630136986301369863013698630136986, -.35745588892180374385176833129662554711100, .6994535519125683060109289617486338797814, -.36018440357500774995358483465679455548530, .6975476839237057220708446866485013623978, -.36290549368936841911903457003063522279280, .6956521739130434782608695652173913043478, -.36561919956096466943762379742111079394830, .6937669376693766937669376693766937669377, -.36832556115870762614150635272380895912650, .6918918918918918918918918918918918918919, -.37102461812787262962487488948681857436900, .6900269541778975741239892183288409703504, -.37371640979358405898480555151763837784530, .6881720430107526881720430107526881720430, -.37640097516425302659470730759494472295050, .6863270777479892761394101876675603217158, -.37907835293496944251145919224654790014030, .6844919786096256684491978609625668449198, -.38174858149084833769393299007788300514230, .6826666666666666666666666666666666666667, -.38441169891033200034513583887019194662580, .6808510638297872340425531914893617021277, -.38706774296844825844488013899535872042180, .6790450928381962864721485411140583554377, -.38971675114002518602873692543653305619950, .6772486772486772486772486772486772486772, -.39235876060286384303665840889152605086580, .6754617414248021108179419525065963060686, -.39499380824086893770896722344332374632350, .6736842105263157894736842105263157894737, -.39762193064713846624158577469643205404280, .6719160104986876640419947506561679790026, -.40024316412701266276741307592601515352730, .6701570680628272251308900523560209424084, -.40285754470108348090917615991202183067800, .6684073107049608355091383812010443864230, -.40546510810816432934799991016916465014230, .6666666666666666666666666666666666666667, -.40806588980822172674223224930756259709600, .6649350649350649350649350649350649350649, -.41065992498526837639616360320360399782650, .6632124352331606217616580310880829015544, -.41324724855021932601317757871584035456180, .6614987080103359173126614987080103359173, -.41582789514371093497757669865677598863850, .6597938144329896907216494845360824742268, -.41840189913888381489925905043492093682300, .6580976863753213367609254498714652956298, -.42096929464412963239894338585145305842150, .6564102564102564102564102564102564102564, -.42353011550580327293502591601281892508280, .6547314578005115089514066496163682864450, -.42608439531090003260516141381231136620050, .6530612244897959183673469387755102040816, -.42863216738969872610098832410585600882780, .6513994910941475826972010178117048346056, -.43117346481837132143866142541810404509300, .6497461928934010152284263959390862944162, -.43370832042155937902094819946796633303180, .6481012658227848101265822784810126582278, -.43623676677491801667585491486534010618930, .6464646464646464646464646464646464646465, -.43875883620762790027214350629947148263450, .6448362720403022670025188916876574307305, -.44127456080487520440058801796112675219780, .6432160804020100502512562814070351758794, -.44378397241030093089975139264424797147500, .6416040100250626566416040100250626566416, -.44628710262841947420398014401143882423650, .6400000000000000000000000000000000000000, -.44878398282700665555822183705458883196130, .6384039900249376558603491271820448877805, -.45127464413945855836729492693848442286250, 
.6368159203980099502487562189054726368159, -.45375911746712049854579618113348260521900, .6352357320099255583126550868486352357320, -.45623743348158757315857769754074979573500, .6336633663366336633663366336633663366337, -.45870962262697662081833982483658473938700, .6320987654320987654320987654320987654321, -.46117571512217014895185229761409573256980, .6305418719211822660098522167487684729064, -.46363574096303250549055974261136725544930, .6289926289926289926289926289926289926290, -.46608972992459918316399125615134835243230, .6274509803921568627450980392156862745098, -.46853771156323925639597405279346276074650, .6259168704156479217603911980440097799511, -.47097971521879100631480241645476780831830, .6243902439024390243902439024390243902439, -.47341577001667212165614273544633761048330, .6228710462287104622871046228710462287105, -.47584590486996386493601107758877333253630, .6213592233009708737864077669902912621359, -.47827014848147025860569669930555392056700, .6198547215496368038740920096852300242131, -.48068852934575190261057286988943815231330, .6183574879227053140096618357487922705314, -.48310107575113581113157579238759353756900, .6168674698795180722891566265060240963855, -.48550781578170076890899053978500887751580, .6153846153846153846153846153846153846154, -.48790877731923892879351001283794175833480, .6139088729016786570743405275779376498801, -.49030398804519381705802061333088204264650, .6124401913875598086124401913875598086124, -.49269347544257524607047571407747454941280, .6109785202863961813842482100238663484487, -.49507726679785146739476431321236304938800, .6095238095238095238095238095238095238095, -.49745538920281889838648226032091770321130, .6080760095011876484560570071258907363420, -.49982786955644931126130359189119189977650, .6066350710900473933649289099526066350711, -.50219473456671548383667413872899487614650, .6052009456264775413711583924349881796690, -.50455601075239520092452494282042607665050, .6037735849056603773584905660377358490566, -.50691172444485432801997148999362252652650, .6023529411764705882352941176470588235294, -.50926190178980790257412536448100581765150, .6009389671361502347417840375586854460094, -.51160656874906207391973111953120678663250, .5995316159250585480093676814988290398126, -.51394575110223428282552049495279788970950, .5981308411214953271028037383177570093458, -.51627947444845445623684554448118433356300, .5967365967365967365967365967365967365967, -.51860776420804555186805373523384332656850, .5953488372093023255813953488372093023256, -.52093064562418522900344441950437612831600, .5939675174013921113689095127610208816705, -.52324814376454775732838697877014055848100, .5925925925925925925925925925925925925926, -.52556028352292727401362526507000438869000, .5912240184757505773672055427251732101617, -.52786708962084227803046587723656557500350, .5898617511520737327188940092165898617512, -.53016858660912158374145519701414741575700, .5885057471264367816091954022988505747126, -.53246479886947173376654518506256863474850, .5871559633027522935779816513761467889908, -.53475575061602764748158733709715306758900, .5858123569794050343249427917620137299771, -.53704146589688361856929077475797384977350, .5844748858447488584474885844748858447489, -.53932196859560876944783558428753167390800, .5831435079726651480637813211845102505695, -.54159728243274429804188230264117009937750, .5818181818181818181818181818181818181818, -.54386743096728351609669971367111429572100, .5804988662131519274376417233560090702948, -.54613243759813556721383065450936555862450, 
.5791855203619909502262443438914027149321, -.54839232556557315767520321969641372561450, .5778781038374717832957110609480812641084, -.55064711795266219063194057525834068655950, .5765765765765765765765765765765765765766, -.55289683768667763352766542084282264113450, .5752808988764044943820224719101123595506, -.55514150754050151093110798683483153581600, .5739910313901345291479820627802690582960, -.55738115013400635344709144192165695130850, .5727069351230425055928411633109619686801, -.55961578793542265941596269840374588966350, .5714285714285714285714285714285714285714, -.56184544326269181269140062795486301183700, .5701559020044543429844097995545657015590, -.56407013828480290218436721261241473257550, .5688888888888888888888888888888888888889, -.56628989502311577464155334382667206227800, .5676274944567627494456762749445676274945, -.56850473535266865532378233183408156037350, .5663716814159292035398230088495575221239, -.57071468100347144680739575051120482385150, .5651214128035320088300220750551876379691, -.57291975356178548306473885531886480748650, .5638766519823788546255506607929515418502, -.57511997447138785144460371157038025558000, .5626373626373626373626373626373626373626, -.57731536503482350219940144597785547375700, .5614035087719298245614035087719298245614, -.57950594641464214795689713355386629700650, .5601750547045951859956236323851203501094, -.58169173963462239562716149521293118596100, .5589519650655021834061135371179039301310, -.58387276558098266665552955601015128195300, .5577342047930283224400871459694989106754, -.58604904500357812846544902640744112432000, .5565217391304347826086956521739130434783, -.58822059851708596855957011939608491957200, .5553145336225596529284164859002169197397, -.59038744660217634674381770309992134571100, .5541125541125541125541125541125541125541, -.59254960960667157898740242671919986605650, .5529157667386609071274298056155507559395, -.59470710774669277576265358220553025603300, .5517241379310344827586206896551724137931, -.59685996110779382384237123915227130055450, .5505376344086021505376344086021505376344, -.59900818964608337768851242799428291618800, .5493562231759656652360515021459227467811, -.60115181318933474940990890900138765573500, .5481798715203426124197002141327623126338, -.60329085143808425240052883964381180703650, .5470085470085470085470085470085470085470, -.60542532396671688843525771517306566238400, .5458422174840085287846481876332622601279, -.60755525022454170969155029524699784815300, .5446808510638297872340425531914893617021, -.60968064953685519036241657886421307921400, .5435244161358811040339702760084925690021, -.61180154110599282990534675263916142284850, .5423728813559322033898305084745762711864, -.61391794401237043121710712512140162289150, .5412262156448202959830866807610993657505, -.61602987721551394351138242200249806046500, .5400843881856540084388185654008438818565, -.61813735955507864705538167982012964785100, .5389473684210526315789473684210526315789, -.62024040975185745772080281312810257077200, .5378151260504201680672268907563025210084, -.62233904640877868441606324267922900617100, .5366876310272536687631027253668763102725, -.62443328801189346144440150965237990021700, .5355648535564853556485355648535564853556, -.62652315293135274476554741340805776417250, .5344467640918580375782881002087682672234, -.62860865942237409420556559780379757285100, .5333333333333333333333333333333333333333, -.63068982562619868570408243613201193511500, .5322245322245322245322245322245322245322, -.63276666957103777644277897707070223987100, 
.5311203319502074688796680497925311203320, -.63483920917301017716738442686619237065300, .5300207039337474120082815734989648033126, -.63690746223706917739093569252872839570050, .5289256198347107438016528925619834710744, -.63897144645792069983514238629140891134750, .5278350515463917525773195876288659793814, -.64103117942093124081992527862894348800200, .5267489711934156378600823045267489711934, -.64308667860302726193566513757104985415950, .5256673511293634496919917864476386036961, -.64513796137358470073053240412264131009600, .5245901639344262295081967213114754098361, -.64718504499530948859131740391603671014300, .5235173824130879345603271983640081799591, -.64922794662510974195157587018911726772800, .5224489795918367346938775510204081632653, -.65126668331495807251485530287027359008800, .5213849287169042769857433808553971486762, -.65330127201274557080523663898929953575150, .5203252032520325203252032520325203252033, -.65533172956312757406749369692988693714150, .5192697768762677484787018255578093306288, -.65735807270835999727154330685152672231200, .5182186234817813765182186234817813765182, -.65938031808912778153342060249997302889800, .5171717171717171717171717171717171717172, -.66139848224536490484126716182800009846700, .5161290322580645161290322580645161290323, -.66341258161706617713093692145776003599150, .5150905432595573440643863179074446680080, -.66542263254509037562201001492212526500250, .5140562248995983935742971887550200803213, -.66742865127195616370414654738851822912700, .5130260521042084168336673346693386773547, -.66943065394262923906154583164607174694550, .5120000000000000000000000000000000000000, -.67142865660530226534774556057527661323550, .5109780439121756487025948103792415169661, -.67342267521216669923234121597488410770900, .5099601593625498007968127490039840637450, -.67541272562017662384192817626171745359900, .5089463220675944333996023856858846918489, -.67739882359180603188519853574689477682100, .5079365079365079365079365079365079365079, -.67938098479579733801614338517538271844400, .5069306930693069306930693069306930693069, -.68135922480790300781450241629499942064300, .5059288537549407114624505928853754940711, -.68333355911162063645036823800182901322850, .5049309664694280078895463510848126232742, -.68530400309891936760919861626462079584600, .5039370078740157480314960629921259842520, -.68727057207096020619019327568821609020250, .5029469548133595284872298624754420432220, -.68923328123880889251040571252815425395950, .5019607843137254901960784313725490196078, -.69314718055994530941723212145818, 5.0e-01, -}; - - - -#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) -static const double ln_2 = 0.69314718055994530941723212145818; - -static void Log_32f( const float *_x, float *y, int n ) -{ - static const float shift[] = { 0, -1.f/512 }; - static const float - A0 = 0.3333333333333333333333333f, - A1 = -0.5f, - A2 = 1.f; - - #undef LOGPOLY - #define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) - - int i = 0; - Cv32suf buf[4]; - const int* x = (const int*)_x; - -#if CV_SSE2 - if( USE_SSE2 ) - { - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128 _1_4 = _mm_set1_ps(1.f); - static const __m128 shift4 = _mm_set1_ps(-1.f/512); - - static const __m128 mA0 = _mm_set1_ps(A0); - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); - - int CV_DECL_ALIGNED(16) idx[4]; - - for( ; i <= n - 4; i += 4 ) - { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), 
_mm_set1_epi32(255)), _mm_set1_epi32(127)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); - - __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); - - h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - - __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); - xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); - xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); - - __m128 zf0 = _mm_mul_ps(xf0, mA0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); - yf0 = _mm_add_ps(yf0, zf0); - - _mm_storeu_ps(y + i, yf0); - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = x[i]; - h1 = x[i+1]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; - y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; - - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = x[i+2]; - h3 = x[i+3]; - - x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); - - buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); - buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); - - y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; - y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; - - h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; - - x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); - - x0 += shift[h0 == 510]; - x1 += shift[h1 == 510]; - y0 += LOGPOLY( x0 ); - y1 += LOGPOLY( x1 ); - - y[i] = (float) y0; - y[i + 1] = (float) y1; - - x2 += shift[h2 == 510]; - x3 += shift[h3 == 510]; - y2 += LOGPOLY( x2 ); - y3 += LOGPOLY( x3 ); - - y[i + 2] = (float) y2; - y[i + 3] = (float) y3; - } - - for( ; i < n; i++ ) - { - int h0 = x[i]; - double y0; - float x0; - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; - - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); - x0 += shift[h0 == 510]; - y0 += LOGPOLY( x0 ); - - y[i] = (float)y0; - } -} - - -static void Log_64f( const double *x, double *y, int n ) -{ - static const double shift[] = { 0, -1./512 }; - static const double - A7 = 1.0, - A6 = -0.5, - A5 = 0.333333333333333314829616256247390992939472198486328125, - A4 = -0.25, - A3 = 0.2, - A2 = -0.1666666666666666574148081281236954964697360992431640625, - A1 = 0.1428571428571428769682682968777953647077083587646484375, - A0 = -0.125; - - #undef LOGPOLY - #define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ 
- (((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ - (((A1*xq + A3)*xq + A5)*xq + A7)*(x)) - - int i = 0; - DBLINT buf[4]; - DBLINT *X = (DBLINT *) x; - -#if CV_SSE2 - if( USE_SSE2 ) - { - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128d _1_2 = _mm_set1_pd(1.); - static const __m128d shift2 = _mm_set1_pd(-1./512); - - static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); - static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); - - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - static const __m128d mA6 = _mm_set1_pd(A6); - static const __m128d mA7 = _mm_set1_pd(A7); - - int CV_DECL_ALIGNED(16) idx[4]; - - for( ; i <= n - 4; i += 4 ) - { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); - - __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); - __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); - - h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); - - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), - _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); - - h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); - xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); - - xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); - xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); - - __m128d zd0 = _mm_mul_pd(xd0, mA0); - __m128d zd1 = _mm_mul_pd(xd1, mA0); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); - - yd0 = _mm_add_pd(yd0, zd0); - yd1 = _mm_add_pd(yd1, zd1); - - _mm_storeu_pd(y + i, yd0); - _mm_storeu_pd(y + i + 2, yd1); - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double xq; - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = X[i].i.lo; - h1 = X[i + 1].i.lo; - buf[0].i.lo = h0; - buf[1].i.lo = h1; - - 
h0 = X[i].i.hi; - h1 = X[i + 1].i.hi; - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); - - y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = X[i + 2].i.lo; - h3 = X[i + 3].i.lo; - buf[2].i.lo = h2; - buf[3].i.lo = h3; - - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = X[i + 2].i.hi; - h3 = X[i + 3].i.hi; - - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); - - buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); - buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); - - y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; - y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; - - x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); - - y0 += LOGPOLY( x0, h0 == 510 ); - y1 += LOGPOLY( x1, h1 == 510 ); - - y[i] = y0; - y[i + 1] = y1; - - y2 += LOGPOLY( x2, h2 == 510 ); - y3 += LOGPOLY( x3, h3 == 510 ); - - y[i + 2] = y2; - y[i + 3] = y3; - } - - for( ; i < n; i++ ) - { - int h0 = X[i].i.hi; - double xq; - double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[0].i.lo = X[i].i.lo; - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - y0 += LOGPOLY( x0, h0 == 510 ); - y[i] = y0; - } -} - #ifdef HAVE_IPP static void Log_32f_ipp(const float *x, float *y, int n) { @@ -2256,7 +826,7 @@ static void Log_32f_ipp(const float *x, float *y, int n) } setIppErrorStatus(); } - Log_32f(x, y, n); + hal::log(x, y, n); } static void Log_64f_ipp(const double *x, double *y, int n) @@ -2270,11 +840,14 @@ static void Log_64f_ipp(const double *x, double *y, int n) } setIppErrorStatus(); } - Log_64f(x, y, n); + hal::log(x, y, n); } #define Log_32f Log_32f_ipp #define Log_64f Log_64f_ipp +#else +#define Log_32f hal::log +#define Log_64f hal::log #endif void log( InputArray _src, OutputArray _dst ) @@ -2651,6 +1224,11 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, #endif +static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt(src, dst, n); } +static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt(src, dst, n); } +static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt(src, dst, n); } +static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt(src, dst, n); } + void pow( InputArray _src, double power, OutputArray _dst ) { int type = _src.type(), depth = CV_MAT_DEPTH(type), @@ -3085,27 +1663,6 @@ void patchNaNs( InputOutputArray _a, double _val ) } } - -void exp(const float* src, float* dst, int n) -{ - Exp_32f(src, dst, n); -} - -void log(const float* src, float* dst, int n) -{ - Log_32f(src, dst, n); -} - -void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees) -{ - FastAtan2_32f(y, x, dst, n, angleInDegrees); -} - -void magnitude(const float* x, const float* y, float* dst, int n) -{ - Magnitude_32f(x, y, dst, n); -} - } CV_IMPL float cvCbrt(float value) { return cv::cubeRoot(value); } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index ca707e78a4..e43df94448 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp 
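For reference while reading these hunks: a minimal, self-contained sketch of how the new cv::hal entry points are meant to be called once Exp_32f/Log_32f, Sqrt_32f/InvSqrt_32f and the Hamming norms are redirected to the HAL. It is not part of the patch; the include path and the demo function name are assumptions, while the pointer/length signatures follow the declarations added to opencv2/hal.hpp later in this patch.

    // Sketch only (not from the patch). Assumes the header installs as "opencv2/hal.hpp";
    // hal_math_demo() is a made-up name for illustration.
    #include <vector>
    #include "opencv2/hal.hpp"

    static void hal_math_demo()
    {
        const int n = 8;
        std::vector<float> src(n, 0.5f), dst(n);

        cv::hal::exp(src.data(), dst.data(), n);      // replaces the removed Exp_32f table code
        cv::hal::log(src.data(), dst.data(), n);      // replaces the removed Log_32f table code
        cv::hal::sqrt(src.data(), dst.data(), n);     // declared in hal.hpp in this patch
        cv::hal::invSqrt(src.data(), dst.data(), n);  // declared in hal.hpp in this patch

        // normHamming now returns its result by value instead of writing through an
        // output parameter, as used in the stat.cpp hunk that follows.
        unsigned char a[16] = {0}, b[16];
        for (int i = 0; i < 16; i++) b[i] = (unsigned char)i;
        int dist = cv::hal::normHamming(a, b, 16);
        (void)dist;
    }

The same (src, dst, n) calling convention is what lets the IPP wrappers above fall back to hal::exp and hal::log without further changes.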
@@ -2416,140 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal, namespace cv { -float normL2Sqr_(const float* a, const float* b, int n) -{ - int j = 0; float d = 0.f; -#if CV_SSE - if( USE_SSE2 ) - { - float CV_DECL_ALIGNED(16) buf[4]; - __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0)); - d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; - } - else -#endif - { - for( ; j <= n - 4; j += 4 ) - { - float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3]; - d += t0*t0 + t1*t1 + t2*t2 + t3*t3; - } - } - - for( ; j < n; j++ ) - { - float t = a[j] - b[j]; - d += t*t; - } - return d; -} - - -float normL1_(const float* a, const float* b, int n) -{ - int j = 0; float d = 0.f; -#if CV_SSE - if( USE_SSE2 ) - { - float CV_DECL_ALIGNED(16) buf[4]; - static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; - __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); - __m128 absmask = _mm_load_ps((const float*)absbuf); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); - d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; - } - else -#elif CV_NEON - float32x4_t v_sum = vdupq_n_f32(0.0f); - for ( ; j <= n - 4; j += 4) - v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); - - float CV_DECL_ALIGNED(16) buf[4]; - vst1q_f32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; -#endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } - - for( ; j < n; j++ ) - d += std::abs(a[j] - b[j]); - return d; -} - -int normL1_(const uchar* a, const uchar* b, int n) -{ - int j = 0, d = 0; -#if CV_SSE - if( USE_SSE2 ) - { - __m128i d0 = _mm_setzero_si128(); - - for( ; j <= n - 16; j += 16 ) - { - __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); - __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - - for( ; j <= n - 4; j += 4 ) - { - __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); - __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); - } - else -#elif CV_NEON - uint32x4_t v_sum = vdupq_n_u32(0.0f); - for ( ; j <= n - 16; j += 16) - { - uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); - uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); - } - - uint CV_DECL_ALIGNED(16) buf[4]; - vst1q_u32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; -#endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } - for( ; j < n; j++ ) - d += 
std::abs(a[j] - b[j]); - return d; -} - template int normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) { @@ -2564,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) if( mask[i] ) { for( int k = 0; k < cn; k++ ) - result = std::max(result, ST(std::abs(src[k]))); + result = std::max(result, ST(cv_abs(src[k]))); } } *_result = result; @@ -2585,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) if( mask[i] ) { for( int k = 0; k < cn; k++ ) - result += std::abs(src[k]); + result += cv_abs(src[k]); } } *_result = result; @@ -2684,9 +2550,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const { - int result = 0; - cv::hal::normHamming(a, b, size, result); - return result; + return cv::hal::normHamming(a, b, size); } #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \ @@ -3037,16 +2901,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) if( normType == NORM_HAMMING ) { - int result = 0; - cv::hal::normHamming(data, (int)len, result); - return result; + return hal::normHamming(data, (int)len); } if( normType == NORM_HAMMING2 ) { - int result = 0; - hal::normHamming(data, (int)len, 2, result); - return result; + return hal::normHamming(data, (int)len, 2); } } } @@ -3072,9 +2932,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) for( size_t i = 0; i < it.nplanes; i++, ++it ) { - int one = 0; - cv::hal::normHamming(ptrs[0], total, cellSize, one); - result += one; + result += hal::normHamming(ptrs[0], total, cellSize); } return result; @@ -3558,9 +3416,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m for( size_t i = 0; i < it.nplanes; i++, ++it ) { - int one = 0; - hal::normHamming(ptrs[0], ptrs[1], total, cellSize, one); - result += one; + result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize); } return result; @@ -3698,7 +3554,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, if( !mask ) { for( int i = 0; i < nvecs; i++ ) - hal::normHamming(src1, src2 + step2*i, len, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len); } else { @@ -3706,7 +3562,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, for( int i = 0; i < nvecs; i++ ) { if (mask[i]) - hal::normHamming(src1, src2 + step2*i, len, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len); else dist[i] = val0; } @@ -3720,7 +3576,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2 if( !mask ) { for( int i = 0; i < nvecs; i++ ) - hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2); } else { @@ -3728,7 +3584,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2 for( int i = 0; i < nvecs; i++ ) { if (mask[i]) - hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2); else dist[i] = val0; } diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp index fd15345b29..d12656e994 100644 --- a/modules/features2d/src/kaze/AKAZEFeatures.cpp +++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp @@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector(float)(2.0*CV_PI) ? 
ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0)); diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index 7641c46cad..95d1ac66c3 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -81,28 +81,17 @@ float normL1_(const float* a, const float* b, int n); float normL2Sqr_(const float* a, const float* b, int n); void exp(const float* src, float* dst, int n); +void exp(const double* src, double* dst, int n); void log(const float* src, float* dst, int n); +void log(const double* src, double* dst, int n); void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); void magnitude(const float* x, const float* y, float* dst, int n); - -/** @brief Computes the cube root of an argument. - - The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly. - NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for - single-precision data. - @param val A function argument. - */ -float cubeRoot(float val); - -/** @brief Calculates the angle of a 2D vector in degrees. - - The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured - in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees. - @param x x-coordinate of the vector. - @param y y-coordinate of the vector. - */ -float fastAtan2(float y, float x); +void magnitude(const double* x, const double* y, double* dst, int n); +void sqrt(const float* src, float* dst, int len); +void sqrt(const double* src, double* dst, int len); +void invSqrt(const float* src, float* dst, int len); +void invSqrt(const double* src, double* dst, int len); }} //cv::hal diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index 6e1ff2a0ad..c011fe617e 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -380,7 +380,7 @@ cvRound( double value ) TEGRA_ROUND_DBL(value); #elif defined CV_ICC || defined __GNUC__ # if CV_VFP - ARM_ROUND_DBL(value) + ARM_ROUND_DBL(value); # else return (int)lrint(value); # endif @@ -488,7 +488,7 @@ CV_INLINE int cvRound(float value) TEGRA_ROUND_FLT(value); #elif defined CV_ICC || defined __GNUC__ # if CV_VFP - ARM_ROUND_FLT(value) + ARM_ROUND_FLT(value); # else return (int)lrintf(value); # endif diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index b7b147a199..c3c47e0594 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -438,6 +438,14 @@ OPENCV_HAL_IMPL_ADDSUB_OP(v_add_wrap, +, (_Tp), _Tp) OPENCV_HAL_IMPL_ADDSUB_OP(v_sub_wrap, -, (_Tp), _Tp) OPENCV_HAL_IMPL_ADDSUB_OP(v_absdiff, -, (rtype)std::abs, typename TypeTraits<_Tp>::abs_type) +template inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = 1.f/std::sqrt(a.s[i]); + return c; +} + template inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { v_reg<_Tp, n> c; @@ -446,6 +454,7 @@ template inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, return c; } + template inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { v_reg<_Tp, n> c; @@ -544,7 +553,7 @@ template inline void v_expand(const v_reg<_Tp, n>& a, } template inline v_reg::int_type, n> - v_reinterpret_int(const v_reg<_Tp, n>& a) + v_reinterpret_as_int(const v_reg<_Tp, n>& a) { 
v_reg::int_type, n> c; for( int i = 0; i < n; i++ ) @@ -553,7 +562,7 @@ template inline v_reg::int_type, n } template inline v_reg::uint_type, n> - v_reinterpret_uint(const v_reg<_Tp, n>& a) + v_reinterpret_as_uint(const v_reg<_Tp, n>& a) { v_reg::uint_type, n> c; for( int i = 0; i < n; i++ ) @@ -944,7 +953,7 @@ struct v_uint16x8 { return (ushort)_mm_extract_epi16(val, i); } - uchar get0() const + ushort get0() const { return (ushort)_mm_cvtsi128_si32(val); } @@ -1069,79 +1078,79 @@ inline v_int32x4 v_setall_s32(int v) { return v_int32x4(_mm_set1_epi32(v)); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4(_mm_set1_ps(v)); } inline v_float64x2 v_setall_f64(double v) { return v_float64x2(_mm_set1_pd(v)); } -template inline v_uint8x16 v_reinterpret_u8(const _Tpvec& a) +template inline v_uint8x16 v_reinterpret_as_u8(const _Tpvec& a) { return v_uint8x16(a.val); } -inline v_uint8x16 v_reinterpret_u8(const v_float32x4& a) +inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& a) { return v_uint8x16(_mm_castps_si128(a.val)); } -inline v_uint8x16 v_reinterpret_u8(const v_float64x2& a) +inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& a) { return v_uint8x16(_mm_castpd_si128(a.val)); } -template inline v_int8x16 v_reinterpret_s8(const _Tpvec& a) +template inline v_int8x16 v_reinterpret_as_s8(const _Tpvec& a) { return v_int8x16(a.val); } -inline v_int8x16 v_reinterpret_s8(const v_float32x4& a) +inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& a) { return v_int8x16(_mm_castps_si128(a.val)); } -inline v_int8x16 v_reinterpret_s8(const v_float64x2& a) +inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& a) { return v_int8x16(_mm_castpd_si128(a.val)); } -template inline v_uint16x8 v_reinterpret_u16(const _Tpvec& a) +template inline v_uint16x8 v_reinterpret_as_u16(const _Tpvec& a) { return v_uint16x8(a.val); } -inline v_uint16x8 v_reinterpret_u16(const v_float32x4& a) +inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& a) { return v_uint16x8(_mm_castps_si128(a.val)); } -inline v_uint16x8 v_reinterpret_u16(const v_float64x2& a) +inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& a) { return v_uint16x8(_mm_castpd_si128(a.val)); } -template inline v_int16x8 v_reinterpret_s16(const _Tpvec& a) +template inline v_int16x8 v_reinterpret_as_s16(const _Tpvec& a) { return v_int16x8(a.val); } -inline v_int16x8 v_reinterpret_s16(const v_float32x4& a) +inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& a) { return v_int16x8(_mm_castps_si128(a.val)); } -inline v_int16x8 v_reinterpret_s16(const v_float64x2& a) +inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& a) { return v_int16x8(_mm_castpd_si128(a.val)); } -template inline v_uint32x4 v_reinterpret_u32(const _Tpvec& a) +template inline v_uint32x4 v_reinterpret_as_u32(const _Tpvec& a) { return v_uint32x4(a.val); } -inline v_uint32x4 v_reinterpret_u32(const v_float32x4& a) +inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& a) { return v_uint32x4(_mm_castps_si128(a.val)); } -inline v_uint32x4 v_reinterpret_u32(const v_float64x2& a) +inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& a) { return v_uint32x4(_mm_castpd_si128(a.val)); } -template inline v_int32x4 v_reinterpret_s32(const _Tpvec& a) +template inline v_int32x4 v_reinterpret_as_s32(const _Tpvec& a) { return v_int32x4(a.val); } -inline v_int32x4 v_reinterpret_s32(const v_float32x4& a) +inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& a) { return v_int32x4(_mm_castps_si128(a.val)); } -inline v_int32x4 v_reinterpret_s32(const 
v_float64x2& a) +inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& a) { return v_int32x4(_mm_castpd_si128(a.val)); } -template inline v_float32x4 v_reinterpret_f32(const _Tpvec& a) +template inline v_float32x4 v_reinterpret_as_f32(const _Tpvec& a) { return v_float32x4(_mm_castsi128_ps(a.val)); } -inline v_float32x4 v_reinterpret_f32(const v_float64x2& a) +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) { return v_float32x4(_mm_castpd_ps(a.val)); } -template inline v_float64x2 v_reinterpret_f64(const _Tpvec& a) +template inline v_float64x2 v_reinterpret_as_f64(const _Tpvec& a) { return v_float64x2(_mm_castsi128_pd(a.val)); } -inline v_float64x2 v_reinterpret_f64(const v_float64x2& a) +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) { return v_float64x2(_mm_castps_pd(a.val)); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b) +inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) { __m128i delta = _mm_set1_epi16(255); return v_uint8x16(_mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta))); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) +inline v_uint8x16 v_shiftn_u16(const v_uint16x8& a, const v_uint16x8& b, int n) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); @@ -1149,81 +1158,53 @@ inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) _mm_srli_epi16(_mm_add_epi16(b.val, delta), n))); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b) +inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_uint8x16 v_shiftun_s16(const v_int16x8& a, const v_int16x8& b, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a) +inline void v_storen_u16(uchar* ptr, const v_uint16x8& a) { __m128i delta = _mm_set1_epi16(255); _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta)); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a, int n) +inline void v_shiftstoren_u16(uchar* ptr, const v_uint16x8& a, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta)); } -inline void v_storesat_u8(uchar* ptr, const v_int16x8& a) +inline void v_storeun_s16(uchar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } -inline void v_storesat_u8(uchar* ptr, const v_int16x8& a, int n) +inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& a, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta)); } -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b) -{ - __m128i delta = _mm_set1_epi16(127); - return v_int8x16(_mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), - _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta))); -} - -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n) -{ - // we 
assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - return v_int8x16(_mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), - _mm_srli_epi16(_mm_add_epi16(b.val, delta), n))); -} - -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b) +inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_int8x16 v_shiftn_s16(const v_int16x8& a, const v_int16x8& b, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); } -inline void v_storesat_s8(schar* ptr, const v_uint16x8& a) -{ - __m128i delta = _mm_set1_epi16(127); - _mm_storel_epi64((__m128i*)ptr, - _mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta)); -} - -inline void v_storesat_s8(schar* ptr, const v_uint16x8& a, int n) -{ - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - _mm_storel_epi64((__m128i*)ptr, - _mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta)); -} -inline void v_storesat_s8(schar* ptr, const v_int16x8& a) +inline void v_storen_s16(schar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } -inline void v_storesat_s8(schar* ptr, const v_int16x8& a, int n) +inline void v_shiftstoren_s16(schar* ptr, const v_int16x8& a, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); _mm_storel_epi64((__m128i*)ptr, @@ -1236,7 +1217,7 @@ inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) +inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) { __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); @@ -1244,20 +1225,20 @@ inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) __m128i r = _mm_packs_epi32(a1, b1); return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n) +inline v_uint16x8 v_shiftn_u32(const v_uint32x4& a, const v_uint32x4& b, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b) +inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) { __m128i delta32 = _mm_set1_epi32(32768); __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_uint16x8 v_shiftun_s32(const v_int32x4& a, const v_int32x4& b, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); @@ -1265,28 +1246,28 @@ inline 
v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a) +inline void v_storen_u32(ushort* ptr, const v_uint32x4& a) { __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a, int n) +inline void v_shiftstoren_u32(ushort* ptr, const v_uint32x4& a, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& a) +inline void v_storeun_s32(ushort* ptr, const v_int32x4& a) { __m128i delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(a.val, delta32); __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& a, int n) +inline void v_shiftstoreun_s32(ushort* ptr, const v_int32x4& a, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); @@ -1294,45 +1275,20 @@ inline void v_storesat_u16(ushort* ptr, const v_int32x4& a, int n) _mm_storel_epi64((__m128i*)ptr, r); } -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767); - __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val); - __m128i b1 = v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val); - return v_int16x8(_mm_packs_epi32(a1, b1)); -} -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)); - return v_int16x8(_mm_packs_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), - _mm_srli_epi32(_mm_add_epi32(b.val, delta), n))); -} -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b) +inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) { return v_int16x8(_mm_packs_epi32(a.val, b.val)); } -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_int16x8 v_shiftn_s32(const v_int32x4& a, const v_int32x4& b, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)); return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); } -inline void v_storesat_s16(short* ptr, const v_uint32x4& a) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767); - __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val); - _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); -} -inline void v_storesat_s16(short* ptr, const v_uint32x4& a, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)); - __m128i a1 = _mm_srli_epi32(_mm_add_epi32(a.val, delta), n); - _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); -} -inline void v_storesat_s16(short* ptr, const v_int32x4& a) +inline void v_storen_s32(short* ptr, const v_int32x4& a) { 
_mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); } -inline void v_storesat_s16(short* ptr, const v_int32x4& a, int n) +inline void v_shiftstoren_s32(short* ptr, const v_int32x4& a, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)); __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); @@ -1373,6 +1329,8 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) @@ -1421,9 +1379,26 @@ OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1 inline v_float32x4 v_sqrt(v_float32x4 x) { return v_float32x4(_mm_sqrt_ps(x.val)); } + +inline v_float32x4 v_invsqrt(v_float32x4 x) +{ + static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); + __m128 t = x.val; + __m128 h = _mm_mul_ps(t, _0_5); + t = _mm_rsqrt_ps(t); + t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); + return v_float32x4(t); +} + inline v_float64x2 v_sqrt(v_float64x2 x) { return v_float64x2(_mm_sqrt_pd(x.val)); } +inline v_float64x2 v_invsqrt(v_float64x2 x) +{ + static const __m128d v_1 = _mm_set1_pd(1.); + return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); +} + inline v_float32x4 v_abs(v_float32x4 x) { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } inline v_float64x2 v_abs(v_float64x2 x) @@ -1893,7 +1868,7 @@ inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ - __m128i t3 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ + __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \ \ b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ @@ -2074,42 +2049,682 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) struct v_uint8x16 { + explicit v_uint8x16(uint8x16_t v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = vld1q_u8(v); + } + uchar get0() const + { + return vgetq_lane_u8(val, 0); + } + uint8x16_t val; }; struct v_int8x16 { + explicit v_int8x16(int8x16_t v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = vld1q_s8(v); + } + schar get0() const + { + return vgetq_lane_s8(val, 0); + } + int8x16_t val; }; struct v_uint16x8 { + explicit v_uint16x8(uint16x8_t v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = vld1q_u16(v); + } + ushort get0() const + { + return vgetq_lane_u16(val, 0); + } + uint16x8_t 
val; }; struct v_int16x8 { + explicit v_int16x8(int16x8_t v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = vld1q_s16(v); + } + short get0() const + { + return vgetq_lane_s16(val, 0); + } + int16x8_t val; }; struct v_uint32x4 { + explicit v_uint32x4(uint32x4_t v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + unsigned v[] = {v0, v1, v2, v3}; + val = vld1q_u32(v); + } + unsigned get0() const + { + return vgetq_lane_u32(val, 0); + } + uint32x4_t val; }; struct v_int32x4 { + explicit v_int32x4(int32x4_t v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + val = vld1q_s32(v); + } + int get0() const + { + return vgetq_lane_s32(val, 0); + } int32x4_t val; }; struct v_float32x4 { + explicit v_float32x4(float32x4_t v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + val = vld1q_f32(v); + } + float get0() const + { + return vgetq_lane_f32(val, 0); + } float32x4_t val; }; typedef v_reg v_float64x2; typedef v_reg v_float64x4; +#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ +inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ +inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ +inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \ +inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \ +inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \ +inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \ +inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \ +inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \ +inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); } + +OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32) + +inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) +{ + uint8x8_t a1 = vqmovn_u16(a.val), b1 = vqmovn_u16(b.val); + return v_uint8x16(vcombine_u8(a1, b1)); +} +inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) +{ + uint8x8_t a1 = vqmovun_s16(a.val), b1 = vqmovun_s16(b.val); + return v_uint8x16(vcombine_u8(a1, b1)); +} +inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) +{ + int8x8_t a1 = vqmovn_s16(a.val), b1 = vqmovn_s16(b.val); + return v_int8x16(vcombine_s8(a1, b1)); +} +inline void v_storen_u16(uchar* ptr, const v_uint16x8& a) { vst1_u8(ptr, vqmovn_u16(a.val)); } +inline void v_storeun_s16(uchar* ptr, const v_int16x8& a) { vst1_u8(ptr, vqmovun_s16(a.val)); } +inline void v_storen_s16(schar* ptr, const v_int16x8& a) { vst1_s8(ptr, vqmovn_s16(a.val)); } + +inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) 
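
The NEON wrappers above follow the same layout as the SSE and plain C++ variants of these types: each holds one 128-bit register in val and exposes an element-wise constructor, get0(), and the v_setzero_* / v_setall_* / v_reinterpret_as_* helpers generated by OPENCV_HAL_IMPL_NEON_INIT. A minimal backend-independent usage sketch (the buffer name is illustrative only):

    // Add a constant to eight packed 16-bit values in place.
    inline void add_offset8(short* data, short offset)
    {
        v_int16x8 v = v_load(data);        // 8 shorts -> one register
        v = v + v_setall_s16(offset);      // maps to vqaddq_s16 / _mm_adds_epi16
        v_store(data, v);
    }
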
+{ + uint16x4_t a1 = vqmovn_u32(a.val), b1 = vqmovn_u32(b.val); + return v_uint16x8(vcombine_u16(a1, b1)); +} +inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) +{ + uint16x4_t a1 = vqmovun_s32(a.val), b1 = vqmovun_s32(b.val); + return v_uint16x8(vcombine_u16(a1, b1)); +} +inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) +{ + int16x4_t a1 = vqmovn_s32(a.val), b1 = vqmovn_s32(b.val); + return v_int16x8(vcombine_s16(a1, b1)); +} +inline void v_storen_u32(ushort* ptr, const v_uint32x4& a) { vst1_u16(ptr, vqmovn_u32(a.val)); } +inline void v_storeun_s32(ushort* ptr, const v_int32x4& a) { vst1_u16(ptr, vqmovun_s32(a.val)); } +inline void v_storen_s32(short* ptr, const v_int32x4& a) { vst1_s16(ptr, vqmovn_s32(a.val)); } + +#define v_shiftn_u16(a, b, n) v_uint8x16(vcombine_u8(vqrshrn_n_u16((a).val, (n)), vqrshrn_n_u16((b).val, (n)))) +#define v_shiftn_s16(a, b, n) v_int8x16(vcombine_s8(vqrshrn_n_s16((a).val, (n)), vqrshrn_n_s16((b).val, (n)))) +#define v_shiftn_u32(a, b, n) v_uint16x8(vcombine_u16(vqrshrn_n_u32((a).val, (n)), vqrshrn_n_u32((b).val, (n)))) +#define v_shiftn_s32(a, b, n) v_int16x8(vcombine_s16(vqrshrn_n_s32((a).val, (n)), vqrshrn_n_s32((b).val, (n)))) +#define v_shiftun_s16(a, b, n) v_uint8x16(vcombine_u8(vqrshrun_n_s16((a).val, (n)), vqrshrun_n_s16((b).val, (n)))) +#define v_shiftun_s32(a, b, n) v_uint16x8(vcombine_u16(vqrshrun_n_s32((a).val, (n)), vqrshrun_n_s32((b).val, (n)))) +#define v_shiftstoren_u16(a, n) vst1_u8(vqrshrn_n_u16((a).val, (n))) +#define v_shiftstoren_s16(a, n) vst1_s8(vqrshrn_n_s16((a).val, (n))) +#define v_shiftstoreun_s16(a, n) vst1_u8(vqrshrun_n_s16((a).val, (n))) +#define v_shiftstoren_u32(a, n) vst1_u16(vqrshrn_n_u32((a).val, (n))) +#define v_shiftstoren_s32(a, n) vst1_s16(vqrshrn_n_s32((a).val, (n))) +#define v_shiftstoreun_s32(a, n) vst1_u16(vqrshrun_n_s32((a).val, (n))) + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val); + float32x4_t res = vmulq_lane_f32(m0.val, vl, 0); + res = vmlaq_lane_f32(res, m1.val, vl, 1); + res = vmlaq_lane_f32(res, m2.val, vh, 0); + res = vmlaq_lane_f32(res, m3.val, vh, 1); + return v_float32x4(res); +} + +#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) + +inline v_float32x4 operator / (const v_float32x4& a, 
const v_float32x4& b) +{ + float32x4_t reciprocal = vrecpeq_f32(b.val); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + return v_float32x4(vmulq_f32(a.val, reciprocal)); +} +inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) +{ + float32x4_t reciprocal = vrecpeq_f32(b.val); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + a.val = vmulq_f32(a.val, reciprocal); + return a; +} + +#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { \ + return _Tpvec(vmvnq_##suffix(a.val)); \ + } + +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32) + +#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val)))); +} + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ + float32x4_t e = vrsqrteq_f32(x.val); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); + return v_float32x4(vmulq_f32(e, x.val)); +} + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + float32x4_t e = vrsqrteq_f32(x.val); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); + return v_float32x4(e); +} + +inline v_float32x4 v_abs(v_float32x4 x) +{ return v_float32x4(vabsq_f32(x.val)); } + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32) 
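
A short aside on the division and square-root code above: NEON has no packed float divide or square root, so both paths start from the rough vrecpeq_f32 / vrsqrteq_f32 estimates and refine them with Newton-Raphson steps; vrecpsq_f32(x, r) returns 2 - x*r and vrsqrtsq_f32(a, b) returns (3 - a*b)/2, so multiplying the estimate by these factors performs one iteration (the SSE v_invsqrt added earlier in this patch applies the same step to _mm_rsqrt_ps). A scalar restatement of the iteration, sketch only with hypothetical helper names:

    // One Newton-Raphson step each for 1/x and 1/sqrt(x), starting from an estimate.
    static inline float refine_recip(float x, float r) { return r * (2.0f - x * r); }            // r' -> 1/x
    static inline float refine_rsqrt(float x, float s) { return s * (1.5f - 0.5f * x * s * s); } // s' -> 1/sqrt(x)

Two such steps on the coarse hardware estimate are what operator/, v_sqrt and v_invsqrt above rely on; v_sqrt then multiplies the refined 1/sqrt(x) by x to obtain sqrt(x).
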
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32) + + +#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } + +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, OPENCV_HAL_NOP, s8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, OPENCV_HAL_NOP, s16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, OPENCV_HAL_NOP, s32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) + +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16) + +// TODO: absdiff for signed integers +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) + +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); +} + +// trade efficiency for convenience +#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)n))); } \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)-n))); } + +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, int, s32) + +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mullo, vmulq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mullo, vmulq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, 
v_mulhi2, vqrdmulhq_s16) + +#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(vld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); } + +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) + +#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + scalartype CV_DECL_ALIGNED(16) buf[4]; \ + v_store_aligned(buf, a); \ + scalartype s0 = scalar_func(buf[0], buf[1]); \ + scalartype s1 = scalar_func(buf[2], buf[3]); \ + return scalar_func(s0, s1); \ +} + +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min) + +inline int v_signmask(const v_uint8x16& a) +{ + uint8x8_t m0 = vcreate_u8(CV_BIG_UINT(0x0706050403020100)); + uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_u8(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); +} +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } + +inline int v_signmask(const v_uint16x8& a) +{ + uint16x4_t m0 = vcreate_u16(CV_BIG_UINT(0x0003000200010000)); + uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_u16(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); +} +inline int v_signmask(const v_int16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + uint32x2_t m0 = vcreate_u32(CV_BIG_UINT(0x0000000100000000)); + uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_u32(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(v0); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, 
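
The v_signmask overloads just above collapse the most significant bit of every lane into an ordinary integer (the NEON counterpart of SSE's _mm_movemask_*): the MSB of lane i ends up in bit i of the result. Combined with the comparison operators this enables cheap branchless tests; a usage sketch with the API defined in this header (buffer contents are illustrative):

    // Count how many of 16 bytes equal a given key.
    inline int count_matches16(const uchar* p /* 16 bytes */, uchar key)
    {
        v_uint8x16 eq = (v_load(p) == v_setall_u8(key)); // all-ones lanes where equal
        int bits = v_signmask(eq);                       // one bit per lane
        int n = 0;
        for( ; bits; bits &= bits - 1 )                  // clear lowest set bit
            n++;
        return n;
    }
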
suffix, shift) \ +inline bool v_check_all(const v_##_Tpvec& a) \ +{ \ + _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \ + uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \ + return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \ +} \ +inline bool v_check_any(const v_##_Tpvec& a) \ +{ \ + _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \ + uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \ + return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \ +} + +OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7) +OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15) +OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31) + +inline bool v_check_all(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_all(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_all(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_all(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } + +inline bool v_check_any(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_any(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } + +#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8) +OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8) +OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16) +OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16) +OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32) +OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32) +OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32) + +#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \ + b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \ +} + +OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8) +OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8) +OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16) +OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16) + +inline v_uint32x4 v_load_expand_q(const uchar* ptr) +{ + uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr); + uint16x4_t v1 = vget_low_u16(vmovl_u8(v0)); + return v_uint32x4(vmovl_u16(v1)); +} + +inline v_int32x4 v_load_expand_q(const schar* ptr) +{ + int8x8_t v0 = vcreate_s8(*(unsigned*)ptr); + int16x4_t v1 = vget_low_s16(vmovl_s8(v0)); + return v_int32x4(vmovl_s16(v1)); +} + +#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \ +inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ +{ \ + _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \ + b0.val = p.val[0]; \ + b1.val = p.val[1]; \ +} \ +inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \ +} \ +inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + return 
v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \ +} \ +inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \ + d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \ +} + +OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8) +OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8) +OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16) +OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16) +OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32) +OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32) +OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32) + +inline v_int32x4 v_round(const v_float32x4& a) +{ + static const int32x4_t v_sign = vdupq_n_s32(1 << 31), + v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); + + int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); + return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); +} + +inline v_int32x4 v_floor(const v_float32x4& a) +{ + int32x4_t a1 = vcvtq_s32_f32(a.val); + uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val); + return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask))); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + int32x4_t a1 = vcvtq_s32_f32(a.val); + uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1)); + return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask))); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(vcvtq_s32_f32(a.val)); } + +#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \ +inline void transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \ + const v_##_Tpvec& a2, const v_##_Tpvec& a3, \ + v_##_Tpvec& b0, v_##_Tpvec& b1, \ + v_##_Tpvec& b2, v_##_Tpvec& b3) \ +{ \ + /* m00 m01 m02 m03 */ \ + /* m10 m11 m12 m13 */ \ + /* m20 m21 m22 m23 */ \ + /* m30 m31 m32 m33 */ \ + _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \ + _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \ + /* m00 m10 m02 m12 */ \ + /* m01 m11 m03 m13 */ \ + /* m20 m30 m22 m32 */ \ + /* m21 m31 m23 m33 */ \ + b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \ + b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \ + b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \ + b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \ +} + +OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32) +OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32) +OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32) + +#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + _Tpvec##x3_t v = vld3q_##suffix(ptr); \ + a.val = v.val[0]; \ + b.val = v.val[1]; \ + c.val = v.val[2]; \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + _Tpvec##x4_t v = vld4q_##suffix(ptr); \ + a.val = v.val[0]; \ + b.val = v.val[1]; \ + c.val = v.val[2]; \ + d.val = v.val[3]; \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ +{ \ + _Tpvec##x3_t v; \ + v.val[0] = a.val; \ + v.val[1] = b.val; \ + v.val[2] = c.val; \ + vst3q_##suffix(ptr, v); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, const v_##_Tpvec& d) \ 
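
The rounding helpers above all lean on vcvtq_s32_f32 truncating toward zero: v_round first adds 0.5 with the input's sign copied into it, while v_floor and v_ceil truncate and then compensate by one element-wise where the truncation overshot (the comparison mask is all-ones, i.e. -1, exactly where a fix-up is needed). A scalar restatement of the same idea, sketch only, inputs assumed to fit in int:

    inline int round_away(float x) { return (int)(x + (x >= 0 ? 0.5f : -0.5f)); } // round half away from zero
    inline int floor_i(float x)    { int t = (int)x; return t - ((float)t > x); } // undo upward truncation
    inline int ceil_i(float x)     { int t = (int)x; return t + ((float)t < x); } // undo downward truncation
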
+{ \ + _Tpvec##x4_t v; \ + v.val[0] = a.val; \ + v.val[1] = b.val; \ + v.val[2] = c.val; \ + v.val[3] = d.val; \ + vst4q_##suffix(ptr, v); \ +} + +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(vcvtq_f32_s32(a.val)); +} + #else typedef v_reg v_uint8x16; @@ -2141,100 +2756,84 @@ inline v_int32x4 v_setall_s32(int v) { return v_int32x4::all(v); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4::all(v); } inline v_float64x2 v_setall_f64(double v) { return v_float64x2::all(v); } -template inline v_uint8x16 v_reinterpret_u8(const v_reg<_Tp, n>& a) +template inline v_uint8x16 v_reinterpret_as_u8(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_int8x16 v_reinterpret_s8(const v_reg<_Tp, n>& a) +template inline v_int8x16 v_reinterpret_as_s8(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_uint16x8 v_reinterpret_u16(const v_reg<_Tp, n>& a) +template inline v_uint16x8 v_reinterpret_as_u16(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_int16x8 v_reinterpret_s16(const v_reg<_Tp, n>& a) +template inline v_int16x8 v_reinterpret_as_s16(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_uint32x4 v_reinterpret_u32(const v_reg<_Tp, n>& a) +template inline v_uint32x4 v_reinterpret_as_u32(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_int32x4 v_reinterpret_s32(const v_reg<_Tp, n>& a) +template inline v_int32x4 v_reinterpret_as_s32(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_float32x4 v_reinterpret_f32(const v_reg<_Tp, n>& a) +template inline v_float32x4 v_reinterpret_as_f32(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_float64x2 v_reinterpret_f64(const v_reg<_Tp, n>& a) +template inline v_float64x2 v_reinterpret_as_f64(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b) +inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) { return v_cvtsat(a, b); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) +inline v_uint8x16 v_shiftn_u16(const v_uint16x8& a, const v_uint16x8& b, int n) { return v_cvtsat(a, b, n); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b) +inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) { return v_cvtsat(a, b); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_uint8x16 v_shiftun_s16(const v_int16x8& a, const v_int16x8& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b) +inline void v_storen_u16(uchar* ptr, const v_uint16x8& b) { return v_storesat(ptr, b); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b, int n) +inline void v_shiftstoren_u16(uchar* ptr, const v_uint16x8& b, int n) { return v_storesat(ptr, b, n); } -inline void v_storesat_u8(uchar* ptr, 
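
The interleaved load/store wrappers defined above (vld3q/vst3q and their 4-channel counterparts) make channel-wise processing of packed pixels straightforward: each channel lands in its own register. A small sketch that swaps the first and third channel of 16 packed 3-channel pixels (pointer and layout are illustrative; the vectors are zero-initialized only to keep the sketch self-contained):

    inline void swap_ch0_ch2_x16(uchar* pix /* 16 pixels x 3 interleaved channels = 48 bytes */)
    {
        v_uint8x16 c0 = v_setzero_u8(), c1 = v_setzero_u8(), c2 = v_setzero_u8();
        v_load_deinterleave(pix, c0, c1, c2); // de-interleave into per-channel registers
        v_store_interleave(pix, c2, c1, c0);  // re-interleave with channels 0 and 2 swapped
    }
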
const v_int16x8& b) +inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& b) { return v_storesat(ptr, b); } -inline void v_storesat_u8(uchar* ptr, const v_int16x8& b, int n) +inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& b, int n) { return v_storesat(ptr, b, n); } -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b) -{ return v_cvtsat(a, b); } -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n) -{ return v_cvtsat(a, b, n); } -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b) +inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) { return v_cvtsat(a, b); } -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_int8x16 v_shiftn_s16(const v_int16x8& a, const v_int16x8& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_s8(schar* ptr, const v_uint16x8& b) +inline void v_storen_s16(schar* ptr, const v_int16x8& b) { return v_storesat(ptr, b); } -inline void v_storesat_s8(schar* ptr, const v_uint16x8& b, int n) -{ return v_storesat(ptr, b, n); } -inline void v_storesat_s8(schar* ptr, const v_int16x8& b) -{ return v_storesat(ptr, b); } -inline void v_storesat_s8(schar* ptr, const v_int16x8& b, int n) +inline void v_shiftstoren_s16(schar* ptr, const v_int16x8& b, int n) { return v_storesat(ptr, b, n); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) +inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) { return v_cvtsat(a, b); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n) +inline v_uint16x8 v_shiftn_u32(const v_uint32x4& a, const v_uint32x4& b, int n) { return v_cvtsat(a, b, n); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b) +inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) { return v_cvtsat(a, b); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_uint16x8 v_shiftun_s32(const v_int32x4& a, const v_int32x4& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b) +inline void v_storen_u32(ushort* ptr, const v_uint32x4& b) { return v_storesat(ptr, b); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b, int n) +inline void v_shiftstoren_u32(ushort* ptr, const v_uint32x4& b, int n) { return v_storesat(ptr, b, n); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& b) +inline void v_storeun_s32(ushort* ptr, const v_int32x4& b) { return v_storesat(ptr, b); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& b, int n) +inline void v_shiftstoreun_s32(ushort* ptr, const v_int32x4& b, int n) { return v_storesat(ptr, b, n); } -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b) -{ return v_cvtsat(a, b); } -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n) -{ return v_cvtsat(a, b, n); } -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b) +inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) { return v_cvtsat(a, b); } -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_int16x8 v_shiftn_s32(const v_int32x4& a, const v_int32x4& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_s16(short* ptr, const v_uint32x4& b) -{ return v_storesat(ptr, b); } -inline void v_storesat_s16(short* ptr, const v_uint32x4& b, int n) -{ return v_storesat(ptr, b, n); } -inline void v_storesat_s16(short* ptr, const v_int32x4& b) +inline void v_storen_s32(short* 
ptr, const v_int32x4& b) { return v_storesat(ptr, b); } -inline void v_storesat_s16(short* ptr, const v_int32x4& b, int n) +inline void v_shiftstoren_s32(short* ptr, const v_int32x4& b, int n) { return v_storesat(ptr, b, n); } inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, diff --git a/modules/hal/src/mathfuncs.cpp b/modules/hal/src/mathfuncs.cpp index a3f69facca..e970cfedb8 100644 --- a/modules/hal/src/mathfuncs.cpp +++ b/modules/hal/src/mathfuncs.cpp @@ -44,4 +44,1309 @@ namespace cv { namespace hal { +///////////////////////////////////// ATAN2 //////////////////////////////////// +static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI); +static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI); +static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI); +static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); + +#if CV_NEON +static inline float32x4_t cv_vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} +#endif + +void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) +{ + int i = 0; + float scale = angleInDegrees ? 1 : (float)(CV_PI/180); + +#ifdef HAVE_TEGRA_OPTIMIZATION + if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale)) + return; +#endif + +#if CV_SSE2 + Cv32suf iabsmask; iabsmask.i = 0x7fffffff; + __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f); + __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f); + __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale); + __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3); + __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7); + + for( ; i <= len - 4; i += 4 ) + { + __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i); + __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask); + __m128 mask = _mm_cmplt_ps(ax, ay); + __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay); + __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps)); + __m128 c2 = _mm_mul_ps(c, c); + __m128 a = _mm_mul_ps(c2, p7); + a = _mm_mul_ps(_mm_add_ps(a, p5), c2); + a = _mm_mul_ps(_mm_add_ps(a, p3), c2); + a = _mm_mul_ps(_mm_add_ps(a, p1), c); + + __m128 b = _mm_sub_ps(_90, a); + a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); + + b = _mm_sub_ps(_180, a); + mask = _mm_cmplt_ps(x, z); + a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); + + b = _mm_sub_ps(_360, a); + mask = _mm_cmplt_ps(y, z); + a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); + + a = _mm_mul_ps(a, scale4); + _mm_storeu_ps(angle + i, a); + } +#elif CV_NEON + float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON); + float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f); + float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale); + float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3); + float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7); + + for( ; i <= len - 4; i += 4 ) + { + float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i); + float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y); + float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); + float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps))); + float32x4_t c2 = vmulq_f32(c, c); + float32x4_t a = vmulq_f32(c2, p7); + a = 
vmulq_f32(vaddq_f32(a, p5), c2); + a = vmulq_f32(vaddq_f32(a, p3), c2); + a = vmulq_f32(vaddq_f32(a, p1), c); + + a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); + a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a); + a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a); + + vst1q_f32(angle + i, vmulq_f32(a, scale4)); + } +#endif + + for( ; i < len; i++ ) + { + float x = X[i], y = Y[i]; + float ax = std::abs(x), ay = std::abs(y); + float a, c, c2; + if( ax >= ay ) + { + c = ay/(ax + (float)DBL_EPSILON); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + else + { + c = ax/(ay + (float)DBL_EPSILON); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + if( x < 0 ) + a = 180.f - a; + if( y < 0 ) + a = 360.f - a; + angle[i] = (float)(a*scale); + } +} + + +void magnitude(const float* x, const float* y, float* mag, int len) +{ + int i = 0; + +#if CV_SIMD128 + for( ; i <= len - 8; i += 8 ) + { + v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4); + v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4); + x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); + x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + v_store(mag + i, x0); + v_store(mag + i + 4, x1); + } +#endif + + for( ; i < len; i++ ) + { + float x0 = x[i], y0 = y[i]; + mag[i] = std::sqrt(x0*x0 + y0*y0); + } +} + +void magnitude(const double* x, const double* y, double* mag, int len) +{ + int i = 0; + +#if defined CV_SIMD128_64F && CV_SIMD128_64F + for( ; i <= len - 4; i += 4 ) + { + v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); + v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2); + x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); + x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + v_store(mag + i, x0); + v_store(mag + i + 2, x1); + } +#endif + + for( ; i < len; i++ ) + { + double x0 = x[i], y0 = y[i]; + mag[i] = std::sqrt(x0*x0 + y0*y0); + } +} + + +void invSqrt(const float* src, float* dst, int len) +{ + int i = 0; + +#if CV_SIMD128 + for( ; i <= len - 8; i += 8 ) + { + v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + t0 = v_invsqrt(t0); + t1 = v_invsqrt(t1); + v_store(dst + i, t0); v_store(dst + i + 4, t1); + } +#endif + + for( ; i < len; i++ ) + dst[i] = 1/std::sqrt(src[i]); +} + + +void invSqrt(const double* src, double* dst, int len) +{ + int i = 0; + +#if CV_SSE2 + __m128d v_1 = _mm_set1_pd(1.0); + for ( ; i <= len - 2; i += 2) + _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); +#endif + + for( ; i < len; i++ ) + dst[i] = 1/std::sqrt(src[i]); +} + + +void sqrt(const float* src, float* dst, int len) +{ + int i = 0; + +#if CV_SIMD128 + for( ; i <= len - 8; i += 8 ) + { + v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + t0 = v_sqrt(t0); + t1 = v_sqrt(t1); + v_store(dst + i, t0); v_store(dst + i + 4, t1); + } +#endif + + for( ; i < len; i++ ) + dst[i] = std::sqrt(src[i]); +} + + +void sqrt(const double* src, double* dst, int len) +{ + int i = 0; + +#if defined CV_SIMD128_64F && CV_SIMD128_64F + for( ; i <= len - 4; i += 4 ) + { + v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); + t0 = v_sqrt(t0); + t1 = v_sqrt(t1); + v_store(dst + i, t0); v_store(dst + i + 2, t1); + } +#endif + + for( ; i < len; i++ ) + dst[i] = std::sqrt(src[i]); +} + +////////////////////////////////////// EXP ///////////////////////////////////// + +typedef union +{ + struct { +#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) + int hi; + int lo; +#else + int lo; + int hi; +#endif + } i; + 
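
Taken together, the fastAtan2 and magnitude kernels above already provide the core of a cartToPolar-style conversion: each processes len elements on the SIMD path and finishes the tail with its scalar loop, and fastAtan2 evaluates the degree-7 polynomial approximation of atan shown above. A usage sketch, assuming the corresponding cv::hal declarations are visible (buffer names are illustrative):

    void cart_to_polar(const float* x, const float* y, float* mag, float* angle, int len)
    {
        cv::hal::magnitude(x, y, mag, len);         // mag[i]   = sqrt(x[i]^2 + y[i]^2)
        cv::hal::fastAtan2(y, x, angle, len, true); // angle[i] = atan2(y[i], x[i]) in degrees, in [0, 360)
    }
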
double d; +} +DBLINT; + +#define EXPTAB_SCALE 6 +#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) + +#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2 + +static const double expTab[] = { + 1.0 * EXPPOLY_32F_A0, + 1.0108892860517004600204097905619 * EXPPOLY_32F_A0, + 1.0218971486541166782344801347833 * EXPPOLY_32F_A0, + 1.0330248790212284225001082839705 * EXPPOLY_32F_A0, + 1.0442737824274138403219664787399 * EXPPOLY_32F_A0, + 1.0556451783605571588083413251529 * EXPPOLY_32F_A0, + 1.0671404006768236181695211209928 * EXPPOLY_32F_A0, + 1.0787607977571197937406800374385 * EXPPOLY_32F_A0, + 1.0905077326652576592070106557607 * EXPPOLY_32F_A0, + 1.1023825833078409435564142094256 * EXPPOLY_32F_A0, + 1.1143867425958925363088129569196 * EXPPOLY_32F_A0, + 1.126521618608241899794798643787 * EXPPOLY_32F_A0, + 1.1387886347566916537038302838415 * EXPPOLY_32F_A0, + 1.151189229952982705817759635202 * EXPPOLY_32F_A0, + 1.1637248587775775138135735990922 * EXPPOLY_32F_A0, + 1.1763969916502812762846457284838 * EXPPOLY_32F_A0, + 1.1892071150027210667174999705605 * EXPPOLY_32F_A0, + 1.2021567314527031420963969574978 * EXPPOLY_32F_A0, + 1.2152473599804688781165202513388 * EXPPOLY_32F_A0, + 1.2284805361068700056940089577928 * EXPPOLY_32F_A0, + 1.2418578120734840485936774687266 * EXPPOLY_32F_A0, + 1.2553807570246910895793906574423 * EXPPOLY_32F_A0, + 1.2690509571917332225544190810323 * EXPPOLY_32F_A0, + 1.2828700160787782807266697810215 * EXPPOLY_32F_A0, + 1.2968395546510096659337541177925 * EXPPOLY_32F_A0, + 1.3109612115247643419229917863308 * EXPPOLY_32F_A0, + 1.3252366431597412946295370954987 * EXPPOLY_32F_A0, + 1.3396675240533030053600306697244 * EXPPOLY_32F_A0, + 1.3542555469368927282980147401407 * EXPPOLY_32F_A0, + 1.3690024229745906119296011329822 * EXPPOLY_32F_A0, + 1.3839098819638319548726595272652 * EXPPOLY_32F_A0, + 1.3989796725383111402095281367152 * EXPPOLY_32F_A0, + 1.4142135623730950488016887242097 * EXPPOLY_32F_A0, + 1.4296133383919700112350657782751 * EXPPOLY_32F_A0, + 1.4451808069770466200370062414717 * EXPPOLY_32F_A0, + 1.4609177941806469886513028903106 * EXPPOLY_32F_A0, + 1.476826145939499311386907480374 * EXPPOLY_32F_A0, + 1.4929077282912648492006435314867 * EXPPOLY_32F_A0, + 1.5091644275934227397660195510332 * EXPPOLY_32F_A0, + 1.5255981507445383068512536895169 * EXPPOLY_32F_A0, + 1.5422108254079408236122918620907 * EXPPOLY_32F_A0, + 1.5590044002378369670337280894749 * EXPPOLY_32F_A0, + 1.5759808451078864864552701601819 * EXPPOLY_32F_A0, + 1.5931421513422668979372486431191 * EXPPOLY_32F_A0, + 1.6104903319492543081795206673574 * EXPPOLY_32F_A0, + 1.628027421857347766848218522014 * EXPPOLY_32F_A0, + 1.6457554781539648445187567247258 * EXPPOLY_32F_A0, + 1.6636765803267364350463364569764 * EXPPOLY_32F_A0, + 1.6817928305074290860622509524664 * EXPPOLY_32F_A0, + 1.7001063537185234695013625734975 * EXPPOLY_32F_A0, + 1.7186192981224779156293443764563 * EXPPOLY_32F_A0, + 1.7373338352737062489942020818722 * EXPPOLY_32F_A0, + 1.7562521603732994831121606193753 * EXPPOLY_32F_A0, + 1.7753764925265212525505592001993 * EXPPOLY_32F_A0, + 1.7947090750031071864277032421278 * EXPPOLY_32F_A0, + 1.8142521755003987562498346003623 * EXPPOLY_32F_A0, + 1.8340080864093424634870831895883 * EXPPOLY_32F_A0, + 1.8539791250833855683924530703377 * EXPPOLY_32F_A0, + 1.8741676341102999013299989499544 * EXPPOLY_32F_A0, + 1.8945759815869656413402186534269 * EXPPOLY_32F_A0, + 1.9152065613971472938726112702958 * EXPPOLY_32F_A0, + 1.9360617934922944505980559045667 * EXPPOLY_32F_A0, + 1.9571441241754002690183222516269 * 
EXPPOLY_32F_A0, + 1.9784560263879509682582499181312 * EXPPOLY_32F_A0, +}; + + +// the code below uses _mm_cast* intrinsics, which are not avialable on VS2005 +#if (defined _MSC_VER && _MSC_VER < 1500) || \ +(!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402) +#undef CV_SSE2 +#define CV_SSE2 0 +#endif + +static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE); +static const double exp_postscale = 1./(1 << EXPTAB_SCALE); +static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000 + +void exp( const float *_x, float *y, int n ) +{ + static const float + A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), + A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), + A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), + A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); + +#undef EXPPOLY +#define EXPPOLY(x) \ +(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4) + + int i = 0; + const Cv32suf* x = (const Cv32suf*)_x; + Cv32suf buf[4]; + +#if CV_SSE2 + if( n >= 8 ) + { + static const __m128d prescale2 = _mm_set1_pd(exp_prescale); + static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale); + static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); + static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); + + static const __m128 mA1 = _mm_set1_ps(A1); + static const __m128 mA2 = _mm_set1_ps(A2); + static const __m128 mA3 = _mm_set1_ps(A3); + static const __m128 mA4 = _mm_set1_ps(A4); + bool y_aligned = (size_t)(void*)y % 16 == 0; + + ushort CV_DECL_ALIGNED(16) tab_idx[8]; + + for( ; i <= n - 8; i += 8 ) + { + __m128 xf0, xf1; + xf0 = _mm_loadu_ps(&x[i].f); + xf1 = _mm_loadu_ps(&x[i+4].f); + __m128i xi0, xi1, xi2, xi3; + + xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4); + xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4); + + __m128d xd0 = _mm_cvtps_pd(xf0); + __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0)); + __m128d xd1 = _mm_cvtps_pd(xf1); + __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1)); + + xd0 = _mm_mul_pd(xd0, prescale2); + xd2 = _mm_mul_pd(xd2, prescale2); + xd1 = _mm_mul_pd(xd1, prescale2); + xd3 = _mm_mul_pd(xd3, prescale2); + + xi0 = _mm_cvtpd_epi32(xd0); + xi2 = _mm_cvtpd_epi32(xd2); + + xi1 = _mm_cvtpd_epi32(xd1); + xi3 = _mm_cvtpd_epi32(xd3); + + xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0)); + xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2)); + xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1)); + xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3)); + + xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2)); + xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3)); + + xf0 = _mm_mul_ps(xf0, postscale4); + xf1 = _mm_mul_ps(xf1, postscale4); + + xi0 = _mm_unpacklo_epi64(xi0, xi2); + xi1 = _mm_unpacklo_epi64(xi1, xi3); + xi0 = _mm_packs_epi32(xi0, xi1); + + _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); + + xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); + xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); + xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); + xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); + xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); + + __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); + __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); + __m128d yd2 = 
_mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); + __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); + + __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); + __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); + + yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); + yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); + + __m128 zf0 = _mm_add_ps(xf0, mA1); + __m128 zf1 = _mm_add_ps(xf1, mA1); + + zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); + zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); + + zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); + zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); + + zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); + zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); + + zf0 = _mm_mul_ps(zf0, yf0); + zf1 = _mm_mul_ps(zf1, yf1); + + if( y_aligned ) + { + _mm_store_ps(y + i, zf0); + _mm_store_ps(y + i + 4, zf1); + } + else + { + _mm_storeu_ps(y + i, zf0); + _mm_storeu_ps(y + i + 4, zf1); + } + } + } + else +#endif + for( ; i <= n - 4; i += 4 ) + { + double x0 = x[i].f * exp_prescale; + double x1 = x[i + 1].f * exp_prescale; + double x2 = x[i + 2].f * exp_prescale; + double x3 = x[i + 3].f * exp_prescale; + int val0, val1, val2, val3, t; + + if( ((x[i].i >> 23) & 255) > 127 + 10 ) + x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; + + if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) + x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; + + if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) + x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; + + if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) + x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + val1 = cvRound(x1); + val2 = cvRound(x2); + val3 = cvRound(x3); + + x0 = (x0 - val0)*exp_postscale; + x1 = (x1 - val1)*exp_postscale; + x2 = (x2 - val2)*exp_postscale; + x3 = (x3 - val3)*exp_postscale; + + t = (val0 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[0].i = t << 23; + + t = (val1 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[1].i = t << 23; + + t = (val2 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[2].i = t << 23; + + t = (val3 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[3].i = t << 23; + + x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); + + y[i] = (float)x0; + y[i + 1] = (float)x1; + + x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); + x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); + + y[i + 2] = (float)x2; + y[i + 3] = (float)x3; + } + + for( ; i < n; i++ ) + { + double x0 = x[i].f * exp_prescale; + int val0, t; + + if( ((x[i].i >> 23) & 255) > 127 + 10 ) + x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + t = (val0 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 
0 : 255; + + buf[0].i = t << 23; + x0 = (x0 - val0)*exp_postscale; + + y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); + } +} + +void exp( const double *_x, double *y, int n ) +{ + static const double + A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, + A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, + A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0, + A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0, + A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, + A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; + +#undef EXPPOLY +#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) + + int i = 0; + Cv64suf buf[4]; + const Cv64suf* x = (const Cv64suf*)_x; + +#if CV_SSE2 + static const __m128d prescale2 = _mm_set1_pd(exp_prescale); + static const __m128d postscale2 = _mm_set1_pd(exp_postscale); + static const __m128d maxval2 = _mm_set1_pd(exp_max_val); + static const __m128d minval2 = _mm_set1_pd(-exp_max_val); + + static const __m128d mA0 = _mm_set1_pd(A0); + static const __m128d mA1 = _mm_set1_pd(A1); + static const __m128d mA2 = _mm_set1_pd(A2); + static const __m128d mA3 = _mm_set1_pd(A3); + static const __m128d mA4 = _mm_set1_pd(A4); + static const __m128d mA5 = _mm_set1_pd(A5); + + int CV_DECL_ALIGNED(16) tab_idx[4]; + + for( ; i <= n - 4; i += 4 ) + { + __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); + __m128i xi0, xi1; + xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); + xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); + xf0 = _mm_mul_pd(xf0, prescale2); + xf1 = _mm_mul_pd(xf1, prescale2); + + xi0 = _mm_cvtpd_epi32(xf0); + xi1 = _mm_cvtpd_epi32(xf1); + xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); + xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); + + xi0 = _mm_unpacklo_epi64(xi0, xi1); + _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); + + xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); + xi0 = _mm_packs_epi32(xi0, xi0); + xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); + xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); + xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); + xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); + xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); + + __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); + __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); + yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); + yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); + + __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); + __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); + + zf0 = _mm_mul_pd(zf0, yf0); + zf1 = _mm_mul_pd(zf1, yf1); + + _mm_storeu_pd(y + i, zf0); + _mm_storeu_pd(y + i + 2, zf1); + } +#endif + for( ; i <= n - 4; i += 4 ) + { + double x0 = x[i].f * exp_prescale; + double x1 = x[i + 1].f * exp_prescale; + double x2 = x[i + 2].f * exp_prescale; + double x3 = x[i + 3].f * exp_prescale; + + double y0, y1, y2, y3; + int 
val0, val1, val2, val3, t; + + t = (int)(x[i].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x0 = t < 0 ? -exp_max_val : exp_max_val; + + t = (int)(x[i+1].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x1 = t < 0 ? -exp_max_val : exp_max_val; + + t = (int)(x[i+2].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x2 = t < 0 ? -exp_max_val : exp_max_val; + + t = (int)(x[i+3].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x3 = t < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + val1 = cvRound(x1); + val2 = cvRound(x2); + val3 = cvRound(x3); + + x0 = (x0 - val0)*exp_postscale; + x1 = (x1 - val1)*exp_postscale; + x2 = (x2 - val2)*exp_postscale; + x3 = (x3 - val3)*exp_postscale; + + t = (val0 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[0].i = (int64)t << 52; + + t = (val1 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[1].i = (int64)t << 52; + + t = (val2 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[2].i = (int64)t << 52; + + t = (val3 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[3].i = (int64)t << 52; + + y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); + + y[i] = y0; + y[i + 1] = y1; + + y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); + y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); + + y[i + 2] = y2; + y[i + 3] = y3; + } + + for( ; i < n; i++ ) + { + double x0 = x[i].f * exp_prescale; + int val0, t; + + t = (int)(x[i].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x0 = t < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + t = (val0 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + + buf[0].i = (int64)t << 52; + x0 = (x0 - val0)*exp_postscale; + + y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + } +} + +#undef EXPTAB_SCALE +#undef EXPTAB_MASK +#undef EXPPOLY_32F_A0 + +/////////////////////////////////////////// LOG /////////////////////////////////////// + +#define LOGTAB_SCALE 8 +#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) +#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) +#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) + +static const double CV_DECL_ALIGNED(16) icvLogTab[] = { + 0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, + .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, + .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, + .01165061721997527263705585198749759001657, .9884169884169884169884169884169884169884, + .01550418653596525274396267235488267033361, .9846153846153846153846153846153846153846, + .01934296284313093139406447562578250654042, .9808429118773946360153256704980842911877, + .02316705928153437593630670221500622574241, .9770992366412213740458015267175572519084, + .02697658769820207233514075539915211265906, .9733840304182509505703422053231939163498, + .03077165866675368732785500469617545604706, .9696969696969696969696969696969696969697, + .03455238150665972812758397481047722976656, .9660377358490566037735849056603773584906, + .03831886430213659461285757856785494368522, .9624060150375939849624060150375939849624, + .04207121392068705056921373852674150839447, .9588014981273408239700374531835205992509, + .04580953603129420126371940114040626212953, .9552238805970149253731343283582089552239, + .04953393512227662748292900118940451648088, .9516728624535315985130111524163568773234, + 
.05324451451881227759255210685296333394944, .9481481481481481481481481481481481481481, + .05694137640013842427411105973078520037234, .9446494464944649446494464944649446494465, + .06062462181643483993820353816772694699466, .9411764705882352941176470588235294117647, + .06429435070539725460836422143984236754475, .9377289377289377289377289377289377289377, + .06795066190850773679699159401934593915938, .9343065693430656934306569343065693430657, + .07159365318700880442825962290953611955044, .9309090909090909090909090909090909090909, + .07522342123758751775142172846244648098944, .9275362318840579710144927536231884057971, + .07884006170777602129362549021607264876369, .9241877256317689530685920577617328519856, + .08244366921107458556772229485432035289706, .9208633093525179856115107913669064748201, + .08603433734180314373940490213499288074675, .9175627240143369175627240143369175627240, + .08961215868968712416897659522874164395031, .9142857142857142857142857142857142857143, + .09317722485418328259854092721070628613231, .9110320284697508896797153024911032028470, + .09672962645855109897752299730200320482256, .9078014184397163120567375886524822695035, + .10026945316367513738597949668474029749630, .9045936395759717314487632508833922261484, + .10379679368164355934833764649738441221420, .9014084507042253521126760563380281690141, + .10731173578908805021914218968959175981580, .8982456140350877192982456140350877192982, + .11081436634029011301105782649756292812530, .8951048951048951048951048951048951048951, + .11430477128005862852422325204315711744130, .8919860627177700348432055749128919860627, + .11778303565638344185817487641543266363440, .8888888888888888888888888888888888888889, + .12124924363286967987640707633545389398930, .8858131487889273356401384083044982698962, + .12470347850095722663787967121606925502420, .8827586206896551724137931034482758620690, + .12814582269193003360996385708858724683530, .8797250859106529209621993127147766323024, + .13157635778871926146571524895989568904040, .8767123287671232876712328767123287671233, + .13499516453750481925766280255629681050780, .8737201365187713310580204778156996587031, + .13840232285911913123754857224412262439730, .8707482993197278911564625850340136054422, + .14179791186025733629172407290752744302150, .8677966101694915254237288135593220338983, + .14518200984449788903951628071808954700830, .8648648648648648648648648648648648648649, + .14855469432313711530824207329715136438610, .8619528619528619528619528619528619528620, + .15191604202584196858794030049466527998450, .8590604026845637583892617449664429530201, + .15526612891112392955683674244937719777230, .8561872909698996655518394648829431438127, + .15860503017663857283636730244325008243330, .8533333333333333333333333333333333333333, + .16193282026931324346641360989451641216880, .8504983388704318936877076411960132890365, + .16524957289530714521497145597095368430010, .8476821192052980132450331125827814569536, + .16855536102980664403538924034364754334090, .8448844884488448844884488448844884488449, + .17185025692665920060697715143760433420540, .8421052631578947368421052631578947368421, + .17513433212784912385018287750426679849630, .8393442622950819672131147540983606557377, + .17840765747281828179637841458315961062910, .8366013071895424836601307189542483660131, + .18167030310763465639212199675966985523700, .8338762214983713355048859934853420195440, + .18492233849401198964024217730184318497780, .8311688311688311688311688311688311688312, + .18816383241818296356839823602058459073300, 
.8284789644012944983818770226537216828479, + .19139485299962943898322009772527962923050, .8258064516129032258064516129032258064516, + .19461546769967164038916962454095482826240, .8231511254019292604501607717041800643087, + .19782574332991986754137769821682013571260, .8205128205128205128205128205128205128205, + .20102574606059073203390141770796617493040, .8178913738019169329073482428115015974441, + .20421554142869088876999228432396193966280, .8152866242038216560509554140127388535032, + .20739519434607056602715147164417430758480, .8126984126984126984126984126984126984127, + .21056476910734961416338251183333341032260, .8101265822784810126582278481012658227848, + .21372432939771812687723695489694364368910, .8075709779179810725552050473186119873817, + .21687393830061435506806333251006435602900, .8050314465408805031446540880503144654088, + .22001365830528207823135744547471404075630, .8025078369905956112852664576802507836991, + .22314355131420973710199007200571941211830, .8000000000000000000000000000000000000000, + .22626367865045338145790765338460914790630, .7975077881619937694704049844236760124611, + .22937410106484582006380890106811420992010, .7950310559006211180124223602484472049689, + .23247487874309405442296849741978803649550, .7925696594427244582043343653250773993808, + .23556607131276688371634975283086532726890, .7901234567901234567901234567901234567901, + .23864773785017498464178231643018079921600, .7876923076923076923076923076923076923077, + .24171993688714515924331749374687206000090, .7852760736196319018404907975460122699387, + .24478272641769091566565919038112042471760, .7828746177370030581039755351681957186544, + .24783616390458124145723672882013488560910, .7804878048780487804878048780487804878049, + .25088030628580937353433455427875742316250, .7781155015197568389057750759878419452888, + .25391520998096339667426946107298135757450, .7757575757575757575757575757575757575758, + .25694093089750041913887912414793390780680, .7734138972809667673716012084592145015106, + .25995752443692604627401010475296061486000, .7710843373493975903614457831325301204819, + .26296504550088134477547896494797896593800, .7687687687687687687687687687687687687688, + .26596354849713793599974565040611196309330, .7664670658682634730538922155688622754491, + .26895308734550393836570947314612567424780, .7641791044776119402985074626865671641791, + .27193371548364175804834985683555714786050, .7619047619047619047619047619047619047619, + .27490548587279922676529508862586226314300, .7596439169139465875370919881305637982196, + .27786845100345625159121709657483734190480, .7573964497041420118343195266272189349112, + .28082266290088775395616949026589281857030, .7551622418879056047197640117994100294985, + .28376817313064456316240580235898960381750, .7529411764705882352941176470588235294118, + .28670503280395426282112225635501090437180, .7507331378299120234604105571847507331378, + .28963329258304265634293983566749375313530, .7485380116959064327485380116959064327485, + .29255300268637740579436012922087684273730, .7463556851311953352769679300291545189504, + .29546421289383584252163927885703742504130, .7441860465116279069767441860465116279070, + .29836697255179722709783618483925238251680, .7420289855072463768115942028985507246377, + .30126133057816173455023545102449133992200, .7398843930635838150289017341040462427746, + .30414733546729666446850615102448500692850, .7377521613832853025936599423631123919308, + .30702503529491181888388950937951449304830, .7356321839080459770114942528735632183908, + 
.30989447772286465854207904158101882785550, .7335243553008595988538681948424068767908, + .31275571000389684739317885942000430077330, .7314285714285714285714285714285714285714, + .31560877898630329552176476681779604405180, .7293447293447293447293447293447293447293, + .31845373111853458869546784626436419785030, .7272727272727272727272727272727272727273, + .32129061245373424782201254856772720813750, .7252124645892351274787535410764872521246, + .32411946865421192853773391107097268104550, .7231638418079096045197740112994350282486, + .32694034499585328257253991068864706903700, .7211267605633802816901408450704225352113, + .32975328637246797969240219572384376078850, .7191011235955056179775280898876404494382, + .33255833730007655635318997155991382896900, .7170868347338935574229691876750700280112, + .33535554192113781191153520921943709254280, .7150837988826815642458100558659217877095, + .33814494400871636381467055798566434532400, .7130919220055710306406685236768802228412, + .34092658697059319283795275623560883104800, .7111111111111111111111111111111111111111, + .34370051385331840121395430287520866841080, .7091412742382271468144044321329639889197, + .34646676734620857063262633346312213689100, .7071823204419889502762430939226519337017, + .34922538978528827602332285096053965389730, .7052341597796143250688705234159779614325, + .35197642315717814209818925519357435405250, .7032967032967032967032967032967032967033, + .35471990910292899856770532096561510115850, .7013698630136986301369863013698630136986, + .35745588892180374385176833129662554711100, .6994535519125683060109289617486338797814, + .36018440357500774995358483465679455548530, .6975476839237057220708446866485013623978, + .36290549368936841911903457003063522279280, .6956521739130434782608695652173913043478, + .36561919956096466943762379742111079394830, .6937669376693766937669376693766937669377, + .36832556115870762614150635272380895912650, .6918918918918918918918918918918918918919, + .37102461812787262962487488948681857436900, .6900269541778975741239892183288409703504, + .37371640979358405898480555151763837784530, .6881720430107526881720430107526881720430, + .37640097516425302659470730759494472295050, .6863270777479892761394101876675603217158, + .37907835293496944251145919224654790014030, .6844919786096256684491978609625668449198, + .38174858149084833769393299007788300514230, .6826666666666666666666666666666666666667, + .38441169891033200034513583887019194662580, .6808510638297872340425531914893617021277, + .38706774296844825844488013899535872042180, .6790450928381962864721485411140583554377, + .38971675114002518602873692543653305619950, .6772486772486772486772486772486772486772, + .39235876060286384303665840889152605086580, .6754617414248021108179419525065963060686, + .39499380824086893770896722344332374632350, .6736842105263157894736842105263157894737, + .39762193064713846624158577469643205404280, .6719160104986876640419947506561679790026, + .40024316412701266276741307592601515352730, .6701570680628272251308900523560209424084, + .40285754470108348090917615991202183067800, .6684073107049608355091383812010443864230, + .40546510810816432934799991016916465014230, .6666666666666666666666666666666666666667, + .40806588980822172674223224930756259709600, .6649350649350649350649350649350649350649, + .41065992498526837639616360320360399782650, .6632124352331606217616580310880829015544, + .41324724855021932601317757871584035456180, .6614987080103359173126614987080103359173, + .41582789514371093497757669865677598863850, 
.6597938144329896907216494845360824742268, + .41840189913888381489925905043492093682300, .6580976863753213367609254498714652956298, + .42096929464412963239894338585145305842150, .6564102564102564102564102564102564102564, + .42353011550580327293502591601281892508280, .6547314578005115089514066496163682864450, + .42608439531090003260516141381231136620050, .6530612244897959183673469387755102040816, + .42863216738969872610098832410585600882780, .6513994910941475826972010178117048346056, + .43117346481837132143866142541810404509300, .6497461928934010152284263959390862944162, + .43370832042155937902094819946796633303180, .6481012658227848101265822784810126582278, + .43623676677491801667585491486534010618930, .6464646464646464646464646464646464646465, + .43875883620762790027214350629947148263450, .6448362720403022670025188916876574307305, + .44127456080487520440058801796112675219780, .6432160804020100502512562814070351758794, + .44378397241030093089975139264424797147500, .6416040100250626566416040100250626566416, + .44628710262841947420398014401143882423650, .6400000000000000000000000000000000000000, + .44878398282700665555822183705458883196130, .6384039900249376558603491271820448877805, + .45127464413945855836729492693848442286250, .6368159203980099502487562189054726368159, + .45375911746712049854579618113348260521900, .6352357320099255583126550868486352357320, + .45623743348158757315857769754074979573500, .6336633663366336633663366336633663366337, + .45870962262697662081833982483658473938700, .6320987654320987654320987654320987654321, + .46117571512217014895185229761409573256980, .6305418719211822660098522167487684729064, + .46363574096303250549055974261136725544930, .6289926289926289926289926289926289926290, + .46608972992459918316399125615134835243230, .6274509803921568627450980392156862745098, + .46853771156323925639597405279346276074650, .6259168704156479217603911980440097799511, + .47097971521879100631480241645476780831830, .6243902439024390243902439024390243902439, + .47341577001667212165614273544633761048330, .6228710462287104622871046228710462287105, + .47584590486996386493601107758877333253630, .6213592233009708737864077669902912621359, + .47827014848147025860569669930555392056700, .6198547215496368038740920096852300242131, + .48068852934575190261057286988943815231330, .6183574879227053140096618357487922705314, + .48310107575113581113157579238759353756900, .6168674698795180722891566265060240963855, + .48550781578170076890899053978500887751580, .6153846153846153846153846153846153846154, + .48790877731923892879351001283794175833480, .6139088729016786570743405275779376498801, + .49030398804519381705802061333088204264650, .6124401913875598086124401913875598086124, + .49269347544257524607047571407747454941280, .6109785202863961813842482100238663484487, + .49507726679785146739476431321236304938800, .6095238095238095238095238095238095238095, + .49745538920281889838648226032091770321130, .6080760095011876484560570071258907363420, + .49982786955644931126130359189119189977650, .6066350710900473933649289099526066350711, + .50219473456671548383667413872899487614650, .6052009456264775413711583924349881796690, + .50455601075239520092452494282042607665050, .6037735849056603773584905660377358490566, + .50691172444485432801997148999362252652650, .6023529411764705882352941176470588235294, + .50926190178980790257412536448100581765150, .6009389671361502347417840375586854460094, + .51160656874906207391973111953120678663250, .5995316159250585480093676814988290398126, + 
.51394575110223428282552049495279788970950, .5981308411214953271028037383177570093458, + .51627947444845445623684554448118433356300, .5967365967365967365967365967365967365967, + .51860776420804555186805373523384332656850, .5953488372093023255813953488372093023256, + .52093064562418522900344441950437612831600, .5939675174013921113689095127610208816705, + .52324814376454775732838697877014055848100, .5925925925925925925925925925925925925926, + .52556028352292727401362526507000438869000, .5912240184757505773672055427251732101617, + .52786708962084227803046587723656557500350, .5898617511520737327188940092165898617512, + .53016858660912158374145519701414741575700, .5885057471264367816091954022988505747126, + .53246479886947173376654518506256863474850, .5871559633027522935779816513761467889908, + .53475575061602764748158733709715306758900, .5858123569794050343249427917620137299771, + .53704146589688361856929077475797384977350, .5844748858447488584474885844748858447489, + .53932196859560876944783558428753167390800, .5831435079726651480637813211845102505695, + .54159728243274429804188230264117009937750, .5818181818181818181818181818181818181818, + .54386743096728351609669971367111429572100, .5804988662131519274376417233560090702948, + .54613243759813556721383065450936555862450, .5791855203619909502262443438914027149321, + .54839232556557315767520321969641372561450, .5778781038374717832957110609480812641084, + .55064711795266219063194057525834068655950, .5765765765765765765765765765765765765766, + .55289683768667763352766542084282264113450, .5752808988764044943820224719101123595506, + .55514150754050151093110798683483153581600, .5739910313901345291479820627802690582960, + .55738115013400635344709144192165695130850, .5727069351230425055928411633109619686801, + .55961578793542265941596269840374588966350, .5714285714285714285714285714285714285714, + .56184544326269181269140062795486301183700, .5701559020044543429844097995545657015590, + .56407013828480290218436721261241473257550, .5688888888888888888888888888888888888889, + .56628989502311577464155334382667206227800, .5676274944567627494456762749445676274945, + .56850473535266865532378233183408156037350, .5663716814159292035398230088495575221239, + .57071468100347144680739575051120482385150, .5651214128035320088300220750551876379691, + .57291975356178548306473885531886480748650, .5638766519823788546255506607929515418502, + .57511997447138785144460371157038025558000, .5626373626373626373626373626373626373626, + .57731536503482350219940144597785547375700, .5614035087719298245614035087719298245614, + .57950594641464214795689713355386629700650, .5601750547045951859956236323851203501094, + .58169173963462239562716149521293118596100, .5589519650655021834061135371179039301310, + .58387276558098266665552955601015128195300, .5577342047930283224400871459694989106754, + .58604904500357812846544902640744112432000, .5565217391304347826086956521739130434783, + .58822059851708596855957011939608491957200, .5553145336225596529284164859002169197397, + .59038744660217634674381770309992134571100, .5541125541125541125541125541125541125541, + .59254960960667157898740242671919986605650, .5529157667386609071274298056155507559395, + .59470710774669277576265358220553025603300, .5517241379310344827586206896551724137931, + .59685996110779382384237123915227130055450, .5505376344086021505376344086021505376344, + .59900818964608337768851242799428291618800, .5493562231759656652360515021459227467811, + .60115181318933474940990890900138765573500, 
.5481798715203426124197002141327623126338, + .60329085143808425240052883964381180703650, .5470085470085470085470085470085470085470, + .60542532396671688843525771517306566238400, .5458422174840085287846481876332622601279, + .60755525022454170969155029524699784815300, .5446808510638297872340425531914893617021, + .60968064953685519036241657886421307921400, .5435244161358811040339702760084925690021, + .61180154110599282990534675263916142284850, .5423728813559322033898305084745762711864, + .61391794401237043121710712512140162289150, .5412262156448202959830866807610993657505, + .61602987721551394351138242200249806046500, .5400843881856540084388185654008438818565, + .61813735955507864705538167982012964785100, .5389473684210526315789473684210526315789, + .62024040975185745772080281312810257077200, .5378151260504201680672268907563025210084, + .62233904640877868441606324267922900617100, .5366876310272536687631027253668763102725, + .62443328801189346144440150965237990021700, .5355648535564853556485355648535564853556, + .62652315293135274476554741340805776417250, .5344467640918580375782881002087682672234, + .62860865942237409420556559780379757285100, .5333333333333333333333333333333333333333, + .63068982562619868570408243613201193511500, .5322245322245322245322245322245322245322, + .63276666957103777644277897707070223987100, .5311203319502074688796680497925311203320, + .63483920917301017716738442686619237065300, .5300207039337474120082815734989648033126, + .63690746223706917739093569252872839570050, .5289256198347107438016528925619834710744, + .63897144645792069983514238629140891134750, .5278350515463917525773195876288659793814, + .64103117942093124081992527862894348800200, .5267489711934156378600823045267489711934, + .64308667860302726193566513757104985415950, .5256673511293634496919917864476386036961, + .64513796137358470073053240412264131009600, .5245901639344262295081967213114754098361, + .64718504499530948859131740391603671014300, .5235173824130879345603271983640081799591, + .64922794662510974195157587018911726772800, .5224489795918367346938775510204081632653, + .65126668331495807251485530287027359008800, .5213849287169042769857433808553971486762, + .65330127201274557080523663898929953575150, .5203252032520325203252032520325203252033, + .65533172956312757406749369692988693714150, .5192697768762677484787018255578093306288, + .65735807270835999727154330685152672231200, .5182186234817813765182186234817813765182, + .65938031808912778153342060249997302889800, .5171717171717171717171717171717171717172, + .66139848224536490484126716182800009846700, .5161290322580645161290322580645161290323, + .66341258161706617713093692145776003599150, .5150905432595573440643863179074446680080, + .66542263254509037562201001492212526500250, .5140562248995983935742971887550200803213, + .66742865127195616370414654738851822912700, .5130260521042084168336673346693386773547, + .66943065394262923906154583164607174694550, .5120000000000000000000000000000000000000, + .67142865660530226534774556057527661323550, .5109780439121756487025948103792415169661, + .67342267521216669923234121597488410770900, .5099601593625498007968127490039840637450, + .67541272562017662384192817626171745359900, .5089463220675944333996023856858846918489, + .67739882359180603188519853574689477682100, .5079365079365079365079365079365079365079, + .67938098479579733801614338517538271844400, .5069306930693069306930693069306930693069, + .68135922480790300781450241629499942064300, .5059288537549407114624505928853754940711, + 
.68333355911162063645036823800182901322850, .5049309664694280078895463510848126232742, + .68530400309891936760919861626462079584600, .5039370078740157480314960629921259842520, + .68727057207096020619019327568821609020250, .5029469548133595284872298624754420432220, + .68923328123880889251040571252815425395950, .5019607843137254901960784313725490196078, + .69314718055994530941723212145818, 5.0e-01, +}; + + + +#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) +static const double ln_2 = 0.69314718055994530941723212145818; + +void log( const float *_x, float *y, int n ) +{ + static const float shift[] = { 0, -1.f/512 }; + static const float + A0 = 0.3333333333333333333333333f, + A1 = -0.5f, + A2 = 1.f; + +#undef LOGPOLY +#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) + + int i = 0; + Cv32suf buf[4]; + const int* x = (const int*)_x; + +#if CV_SSE2 + static const __m128d ln2_2 = _mm_set1_pd(ln_2); + static const __m128 _1_4 = _mm_set1_ps(1.f); + static const __m128 shift4 = _mm_set1_ps(-1.f/512); + + static const __m128 mA0 = _mm_set1_ps(A0); + static const __m128 mA1 = _mm_set1_ps(A1); + static const __m128 mA2 = _mm_set1_ps(A2); + + int CV_DECL_ALIGNED(16) idx[4]; + + for( ; i <= n - 4; i += 4 ) + { + __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); + __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127)); + __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); + __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); + + __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); + + h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); + _mm_store_si128((__m128i*)idx, h0); + h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); + + __m128d t0, t1, t2, t3, t4; + t0 = _mm_load_pd(icvLogTab + idx[0]); + t2 = _mm_load_pd(icvLogTab + idx[1]); + t1 = _mm_unpackhi_pd(t0, t2); + t0 = _mm_unpacklo_pd(t0, t2); + t2 = _mm_load_pd(icvLogTab + idx[2]); + t4 = _mm_load_pd(icvLogTab + idx[3]); + t3 = _mm_unpackhi_pd(t2, t4); + t2 = _mm_unpacklo_pd(t2, t4); + + yd0 = _mm_add_pd(yd0, t0); + yd1 = _mm_add_pd(yd1, t2); + + __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); + + __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); + xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); + xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); + + __m128 zf0 = _mm_mul_ps(xf0, mA0); + zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); + zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); + yf0 = _mm_add_ps(yf0, zf0); + + _mm_storeu_ps(y + i, yf0); + } +#endif + for( ; i <= n - 4; i += 4 ) + { + double x0, x1, x2, x3; + double y0, y1, y2, y3; + int h0, h1, h2, h3; + + h0 = x[i]; + h1 = x[i+1]; + buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); + buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); + + y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; + y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; + + h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + y1 += icvLogTab[h1]; + + h2 = x[i+2]; + h3 = x[i+3]; + + x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); + x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); + + buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); + buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); + + y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; + y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; + + h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + 
h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y2 += icvLogTab[h2]; + y3 += icvLogTab[h3]; + + x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); + x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); + + x0 += shift[h0 == 510]; + x1 += shift[h1 == 510]; + y0 += LOGPOLY( x0 ); + y1 += LOGPOLY( x1 ); + + y[i] = (float) y0; + y[i + 1] = (float) y1; + + x2 += shift[h2 == 510]; + x3 += shift[h3 == 510]; + y2 += LOGPOLY( x2 ); + y3 += LOGPOLY( x3 ); + + y[i + 2] = (float) y2; + y[i + 3] = (float) y3; + } + + for( ; i < n; i++ ) + { + int h0 = x[i]; + double y0; + float x0; + + y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; + + buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); + h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); + x0 += shift[h0 == 510]; + y0 += LOGPOLY( x0 ); + + y[i] = (float)y0; + } +} + +void log( const double *x, double *y, int n ) +{ + static const double shift[] = { 0, -1./512 }; + static const double + A7 = 1.0, + A6 = -0.5, + A5 = 0.333333333333333314829616256247390992939472198486328125, + A4 = -0.25, + A3 = 0.2, + A2 = -0.1666666666666666574148081281236954964697360992431640625, + A1 = 0.1428571428571428769682682968777953647077083587646484375, + A0 = -0.125; + +#undef LOGPOLY +#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ +(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ +(((A1*xq + A3)*xq + A5)*xq + A7)*(x)) + + int i = 0; + DBLINT buf[4]; + DBLINT *X = (DBLINT *) x; + +#if CV_SSE2 + static const __m128d ln2_2 = _mm_set1_pd(ln_2); + static const __m128d _1_2 = _mm_set1_pd(1.); + static const __m128d shift2 = _mm_set1_pd(-1./512); + + static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); + static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); + + static const __m128d mA0 = _mm_set1_pd(A0); + static const __m128d mA1 = _mm_set1_pd(A1); + static const __m128d mA2 = _mm_set1_pd(A2); + static const __m128d mA3 = _mm_set1_pd(A3); + static const __m128d mA4 = _mm_set1_pd(A4); + static const __m128d mA5 = _mm_set1_pd(A5); + static const __m128d mA6 = _mm_set1_pd(A6); + static const __m128d mA7 = _mm_set1_pd(A7); + + int CV_DECL_ALIGNED(16) idx[4]; + + for( ; i <= n - 4; i += 4 ) + { + __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); + __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); + + __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); + __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); + + h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); + + __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), + _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); + __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); + __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); + + h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); + _mm_store_si128((__m128i*)idx, h0); + h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); + + __m128d t0, t1, t2, t3, t4; + t0 = _mm_load_pd(icvLogTab + idx[0]); + t2 = _mm_load_pd(icvLogTab + idx[1]); + t1 = _mm_unpackhi_pd(t0, t2); + t0 = _mm_unpacklo_pd(t0, t2); + t2 = _mm_load_pd(icvLogTab + idx[2]); + t4 = _mm_load_pd(icvLogTab + idx[3]); + t3 = _mm_unpackhi_pd(t2, t4); + t2 = _mm_unpacklo_pd(t2, t4); + + yd0 = _mm_add_pd(yd0, t0); + yd1 = _mm_add_pd(yd1, t2); + + xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); + xd1 = 
_mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); + + xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); + xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); + + __m128d zd0 = _mm_mul_pd(xd0, mA0); + __m128d zd1 = _mm_mul_pd(xd1, mA0); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); + + yd0 = _mm_add_pd(yd0, zd0); + yd1 = _mm_add_pd(yd1, zd1); + + _mm_storeu_pd(y + i, yd0); + _mm_storeu_pd(y + i + 2, yd1); + } +#endif + for( ; i <= n - 4; i += 4 ) + { + double xq; + double x0, x1, x2, x3; + double y0, y1, y2, y3; + int h0, h1, h2, h3; + + h0 = X[i].i.lo; + h1 = X[i + 1].i.lo; + buf[0].i.lo = h0; + buf[1].i.lo = h1; + + h0 = X[i].i.hi; + h1 = X[i + 1].i.hi; + buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); + buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); + + y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; + y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; + + h2 = X[i + 2].i.lo; + h3 = X[i + 3].i.lo; + buf[2].i.lo = h2; + buf[3].i.lo = h3; + + h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + y1 += icvLogTab[h1]; + + h2 = X[i + 2].i.hi; + h3 = X[i + 3].i.hi; + + x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); + x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); + + buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); + buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); + + y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; + y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; + + h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y2 += icvLogTab[h2]; + y3 += icvLogTab[h3]; + + x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); + x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); + + y0 += LOGPOLY( x0, h0 == 510 ); + y1 += LOGPOLY( x1, h1 == 510 ); + + y[i] = y0; + y[i + 1] = y1; + + y2 += LOGPOLY( x2, h2 == 510 ); + y3 += LOGPOLY( x3, h3 == 510 ); + + y[i + 2] = y2; + y[i + 3] = y3; + } + + for( ; i < n; i++ ) + { + int h0 = X[i].i.hi; + double xq; + double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; + + buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); + buf[0].i.lo = X[i].i.lo; + h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); + y0 += LOGPOLY( x0, h0 == 510 ); + y[i] = y0; + } +} + }} diff --git a/modules/hal/src/matrix.cpp b/modules/hal/src/matrix.cpp index a3f69facca..9506aaf478 100644 --- a/modules/hal/src/matrix.cpp +++ b/modules/hal/src/matrix.cpp @@ -44,4 +44,165 @@ namespace cv { namespace hal { +/****************************************************************************************\ +* LU & Cholesky implementation for small matrices * +\****************************************************************************************/ + +template static inline int +LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) +{ + int i, j, k, p = 1; + astep /= 
sizeof(A[0]); + bstep /= sizeof(b[0]); + + for( i = 0; i < m; i++ ) + { + k = i; + + for( j = i+1; j < m; j++ ) + if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) ) + k = j; + + if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() ) + return 0; + + if( k != i ) + { + for( j = i; j < m; j++ ) + std::swap(A[i*astep + j], A[k*astep + j]); + if( b ) + for( j = 0; j < n; j++ ) + std::swap(b[i*bstep + j], b[k*bstep + j]); + p = -p; + } + + _Tp d = -1/A[i*astep + i]; + + for( j = i+1; j < m; j++ ) + { + _Tp alpha = A[j*astep + i]*d; + + for( k = i+1; k < m; k++ ) + A[j*astep + k] += alpha*A[i*astep + k]; + + if( b ) + for( k = 0; k < n; k++ ) + b[j*bstep + k] += alpha*b[i*bstep + k]; + } + + A[i*astep + i] = -d; + } + + if( b ) + { + for( i = m-1; i >= 0; i-- ) + for( j = 0; j < n; j++ ) + { + _Tp s = b[i*bstep + j]; + for( k = i+1; k < m; k++ ) + s -= A[i*astep + k]*b[k*bstep + j]; + b[i*bstep + j] = s*A[i*astep + i]; + } + } + + return p; +} + + +int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n) +{ + return LUImpl(A, astep, m, b, bstep, n); +} + + +int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n) +{ + return LUImpl(A, astep, m, b, bstep, n); +} + + +template static inline bool +CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) +{ + _Tp* L = A; + int i, j, k; + double s; + astep /= sizeof(A[0]); + bstep /= sizeof(b[0]); + + for( i = 0; i < m; i++ ) + { + for( j = 0; j < i; j++ ) + { + s = A[i*astep + j]; + for( k = 0; k < j; k++ ) + s -= L[i*astep + k]*L[j*astep + k]; + L[i*astep + j] = (_Tp)(s*L[j*astep + j]); + } + s = A[i*astep + i]; + for( k = 0; k < j; k++ ) + { + double t = L[i*astep + k]; + s -= t*t; + } + if( s < std::numeric_limits<_Tp>::epsilon() ) + return false; + L[i*astep + i] = (_Tp)(1./std::sqrt(s)); + } + + if( !b ) + return true; + + // LLt x = b + // 1: L y = b + // 2. 
Lt x = y + + /* + [ L00 ] y0 b0 + [ L10 L11 ] y1 = b1 + [ L20 L21 L22 ] y2 b2 + [ L30 L31 L32 L33 ] y3 b3 + + [ L00 L10 L20 L30 ] x0 y0 + [ L11 L21 L31 ] x1 = y1 + [ L22 L32 ] x2 y2 + [ L33 ] x3 y3 + */ + + for( i = 0; i < m; i++ ) + { + for( j = 0; j < n; j++ ) + { + s = b[i*bstep + j]; + for( k = 0; k < i; k++ ) + s -= L[i*astep + k]*b[k*bstep + j]; + b[i*bstep + j] = (_Tp)(s*L[i*astep + i]); + } + } + + for( i = m-1; i >= 0; i-- ) + { + for( j = 0; j < n; j++ ) + { + s = b[i*bstep + j]; + for( k = m-1; k > i; k-- ) + s -= L[k*astep + i]*b[k*bstep + j]; + b[i*bstep + j] = (_Tp)(s*L[i*astep + i]); + } + } + + return true; +} + + +bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n) +{ + return CholImpl(A, astep, m, b, bstep, n); +} + +bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n) +{ + return CholImpl(A, astep, m, b, bstep, n); +} + }} diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index e6923fb897..95ddac9bc8 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -42,3 +42,7 @@ #include "opencv2/hal.hpp" #include "opencv2/hal/intrin.hpp" +#include +#include +#include +#include diff --git a/modules/hal/src/stat.cpp b/modules/hal/src/stat.cpp index bdcf9ed727..ec3b8db5a1 100644 --- a/modules/hal/src/stat.cpp +++ b/modules/hal/src/stat.cpp @@ -80,10 +80,10 @@ static const uchar popCountTable4[] = 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; -Error::Code normHamming(const uchar* a, int n, int & result) +int normHamming(const uchar* a, int n) { int i = 0; - result = 0; + int result = 0; #if CV_NEON { uint32x4_t bits = vmovq_n_u32(0); @@ -104,13 +104,13 @@ Error::Code normHamming(const uchar* a, int n, int & result) popCountTable[a[i+2]] + popCountTable[a[i+3]]; for( ; i < n; i++ ) result += popCountTable[a[i]]; - return Error::Ok; + return result; } -Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result) +int normHamming(const uchar* a, const uchar* b, int n) { int i = 0; - result = 0; + int result = 0; #if CV_NEON { uint32x4_t bits = vmovq_n_u32(0); @@ -133,44 +133,44 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result) popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; for( ; i < n; i++ ) result += popCountTable[a[i] ^ b[i]]; - return Error::Ok; + return result; } -Error::Code normHamming(const uchar* a, int n, int cellSize, int & result) +int normHamming(const uchar* a, int n, int cellSize) { if( cellSize == 1 ) - return normHamming(a, n, result); + return normHamming(a, n); const uchar* tab = 0; if( cellSize == 2 ) tab = popCountTable2; else if( cellSize == 4 ) tab = popCountTable4; else - return Error::Unknown; + return -1; int i = 0; - result = 0; + int result = 0; #if CV_ENABLE_UNROLLED for( ; i <= n - 4; i += 4 ) result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]]; #endif for( ; i < n; i++ ) result += tab[a[i]]; - return Error::Ok; + return result; } -Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int & result) +int normHamming(const uchar* a, const uchar* b, int n, int cellSize) { if( cellSize == 1 ) - return normHamming(a, b, n, result); + return normHamming(a, b, n); const uchar* tab = 0; if( cellSize == 2 ) tab = popCountTable2; else if( cellSize == 4 ) tab = popCountTable4; else - return Error::Unknown; + return -1; int i = 0; - result = 0; + int result = 0; #if CV_ENABLE_UNROLLED for( ; i <= n - 4; i += 4 ) result += tab[a[i] 
^ b[i]] + tab[a[i+1] ^ b[i+1]] + @@ -178,7 +178,129 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int #endif for( ; i < n; i++ ) result += tab[a[i] ^ b[i]]; - return Error::Ok; + return result; +} + +float normL2Sqr_(const float* a, const float* b, int n) +{ + int j = 0; float d = 0.f; +#if CV_SSE + float CV_DECL_ALIGNED(16) buf[4]; + __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); + + for( ; j <= n - 8; j += 8 ) + { + __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); + __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); + d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0)); + d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1)); + } + _mm_store_ps(buf, _mm_add_ps(d0, d1)); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#endif + { + for( ; j <= n - 4; j += 4 ) + { + float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3]; + d += t0*t0 + t1*t1 + t2*t2 + t3*t3; + } + } + + for( ; j < n; j++ ) + { + float t = a[j] - b[j]; + d += t*t; + } + return d; +} + + +float normL1_(const float* a, const float* b, int n) +{ + int j = 0; float d = 0.f; +#if CV_SSE + float CV_DECL_ALIGNED(16) buf[4]; + static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; + __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); + __m128 absmask = _mm_load_ps((const float*)absbuf); + + for( ; j <= n - 8; j += 8 ) + { + __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); + __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); + d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); + d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask)); + } + _mm_store_ps(buf, _mm_add_ps(d0, d1)); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#elif CV_NEON + float32x4_t v_sum = vdupq_n_f32(0.0f); + for ( ; j <= n - 4; j += 4) + v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); + + float CV_DECL_ALIGNED(16) buf[4]; + vst1q_f32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#endif + { + for( ; j <= n - 4; j += 4 ) + { + d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + + std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); + } + } + + for( ; j < n; j++ ) + d += std::abs(a[j] - b[j]); + return d; +} + +int normL1_(const uchar* a, const uchar* b, int n) +{ + int j = 0, d = 0; +#if CV_SSE + __m128i d0 = _mm_setzero_si128(); + + for( ; j <= n - 16; j += 16 ) + { + __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); + __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); + + d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); + } + + for( ; j <= n - 4; j += 4 ) + { + __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); + __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); + + d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); + } + d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); +#elif CV_NEON + uint32x4_t v_sum = vdupq_n_u32(0.0f); + for ( ; j <= n - 16; j += 16) + { + uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); + uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); + } + + uint CV_DECL_ALIGNED(16) buf[4]; + vst1q_u32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#endif + { + for( ; j <= n - 4; j += 4 ) + { + d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + + std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); + } + } + 
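+    // scalar tail: accumulate whatever elements the SSE/NEON/unrolled paths above did not cover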
for( ; j < n; j++ ) + d += std::abs(a[j] - b[j]); + return d; } }} //cv::hal diff --git a/modules/photo/src/arrays.hpp b/modules/photo/src/arrays.hpp index 4aec5f7a1e..cdd59a3286 100644 --- a/modules/photo/src/arrays.hpp +++ b/modules/photo/src/arrays.hpp @@ -44,6 +44,9 @@ #ifndef __OPENCV_DENOISING_ARRAYS_HPP__ #define __OPENCV_DENOISING_ARRAYS_HPP__ +namespace cv +{ + template struct Array2d { @@ -176,4 +179,6 @@ struct Array4d } }; +} + #endif diff --git a/modules/stitching/src/autocalib.cpp b/modules/stitching/src/autocalib.cpp index 56a9df57b8..91244bde15 100644 --- a/modules/stitching/src/autocalib.cpp +++ b/modules/stitching/src/autocalib.cpp @@ -49,7 +49,7 @@ namespace { template static inline bool decomposeCholesky(_Tp* A, size_t astep, int m) { - if (!Cholesky(A, astep, m, 0, 0, 0)) + if (!hal::Cholesky(A, astep, m, 0, 0, 0)) return false; astep /= sizeof(A[0]); for (int i = 0; i < m; ++i) From c7121e877202c44d4def8f9c1d5113c6ee646b39 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 16 Apr 2015 23:42:00 +0300 Subject: [PATCH 03/12] fixed compile warnings --- modules/hal/include/opencv2/hal/defs.h | 2 ++ modules/hal/include/opencv2/hal/intrin.hpp | 27 ++++++++++++++-------- modules/hal/src/mathfuncs.cpp | 4 ++-- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index c011fe617e..18c1adc6ce 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -666,6 +666,8 @@ template<> inline unsigned saturate_cast(double v) { return cvRound(v) //! @endcond +//! @} core_utils + } #endif // __cplusplus diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index c3c47e0594..bfd41ad0d1 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -350,7 +350,6 @@ OPENCV_HAL_IMPL_BIT_OP(^) template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) { v_reg<_Tp, n> c; - typedef typename TypeTraits<_Tp>::int_type itype; for( int i = 0; i < n; i++ ) c.s[i] = TypeTraits<_Tp>::reinterpret_from_int(~TypeTraits<_Tp>::reinterpret_int(a.s[i])); return c; @@ -1426,13 +1425,13 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) { - __m128i delta = _mm_set1_epi8((char)0x80); + __m128i delta = _mm_set1_epi8((char)-128); return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)))); } inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) { - __m128i delta = _mm_set1_epi8((char)0x80); + __m128i delta = _mm_set1_epi8((char)-128); return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)))); } @@ -1523,8 +1522,8 @@ inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ } -OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)0x80) -OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)0x8000) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ @@ -1553,21 +1552,21 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) 
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) -#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, sbit) \ +#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ { \ return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ } \ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ { \ - __m128i smask = _mm_set1_epi8(sbit); \ + __m128i smask = _mm_set1_epi32(smask32); \ __m128i a1 = _mm_xor_si128(a.val, smask); \ __m128i b1 = _mm_xor_si128(b.val, smask); \ return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ } -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (char)0x80) -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (char)0x8000) +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ @@ -1704,7 +1703,7 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ -inline bool v_signmask(const _Tpvec& a) \ +inline int v_signmask(const _Tpvec& a) \ { \ return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ } \ @@ -2850,4 +2849,12 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, }} +#ifndef CV_SIMD128 +#define CV_SIMD128 0 +#endif + +#ifndef CV_SIMD128_64F +#define CV_SIMD128_64F 0 +#endif + #endif diff --git a/modules/hal/src/mathfuncs.cpp b/modules/hal/src/mathfuncs.cpp index e970cfedb8..7d0199f00c 100644 --- a/modules/hal/src/mathfuncs.cpp +++ b/modules/hal/src/mathfuncs.cpp @@ -185,7 +185,7 @@ void magnitude(const double* x, const double* y, double* mag, int len) { int i = 0; -#if defined CV_SIMD128_64F && CV_SIMD128_64F +#if CV_SIMD128_64F for( ; i <= len - 4; i += 4 ) { v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); @@ -262,7 +262,7 @@ void sqrt(const double* src, double* dst, int len) { int i = 0; -#if defined CV_SIMD128_64F && CV_SIMD128_64F +#if CV_SIMD128_64F for( ; i <= len - 4; i += 4 ) { v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); From 1fcc83b84b7ff2a6c945d3a26cc487aae0352b70 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 16 Apr 2015 23:52:23 +0300 Subject: [PATCH 04/12] fixed some more compile warnings --- modules/hal/include/opencv2/hal/intrin.hpp | 18 +++++++++--------- modules/hal/src/precomp.hpp | 1 + 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index bfd41ad0d1..adb95d6a12 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -1140,7 +1140,7 @@ inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) template inline v_float64x2 v_reinterpret_as_f64(const _Tpvec& a) { return v_float64x2(_mm_castsi128_pd(a.val)); } -inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) { return v_float64x2(_mm_castps_pd(a.val)); } inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, 
const v_uint16x8& b) @@ -1893,10 +1893,10 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... - __m128 v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... - __m128 v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... - __m128 v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... - __m128 v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... + __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... + __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... + __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... + __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... @@ -1928,10 +1928,10 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ... - __m128 v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... - __m128 v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... - __m128 v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... - __m128 v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... + __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... + __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... + __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... + __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ... u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ... diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index 95ddac9bc8..630565bec3 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -45,4 +45,5 @@ #include #include #include +#include #include From bfbf864a73210112d49867632d60d23dc30a143f Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 00:13:08 +0300 Subject: [PATCH 05/12] fixed some more compile warnings (2) --- modules/hal/include/opencv2/hal/defs.h | 4 +- modules/hal/include/opencv2/hal/intrin.hpp | 52 +++------------------- 2 files changed, 8 insertions(+), 48 deletions(-) diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index 18c1adc6ce..197533993b 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -666,10 +666,10 @@ template<> inline unsigned saturate_cast(double v) { return cvRound(v) //! @endcond -//! @} core_utils - } #endif // __cplusplus +//! 
@} core_utils + #endif //__OPENCV_HAL_H__ diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index adb95d6a12..3c53d60797 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -905,10 +905,6 @@ struct v_uint8x16 (char)v8, (char)v9, (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15); } - uchar get(const int i) const - { - return (uchar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8)); - } uchar get0() const { return (uchar)_mm_cvtsi128_si32(val); @@ -928,10 +924,6 @@ struct v_int8x16 (char)v8, (char)v9, (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15); } - schar get(const int i) const - { - return (schar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8)); - } schar get0() const { return (schar)_mm_cvtsi128_si32(val); @@ -948,10 +940,6 @@ struct v_uint16x8 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, (short)v4, (short)v5, (short)v6, (short)v7); } - ushort get(const int i) const - { - return (ushort)_mm_extract_epi16(val, i); - } ushort get0() const { return (ushort)_mm_cvtsi128_si32(val); @@ -968,10 +956,6 @@ struct v_int16x8 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, (short)v4, (short)v5, (short)v6, (short)v7); } - short get(const int i) const - { - return (short)_mm_extract_epi16(val, i); - } short get0() const { return (short)_mm_cvtsi128_si32(val); @@ -986,12 +970,6 @@ struct v_uint32x4 { val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); } - unsigned get(const int i) const - { - unsigned CV_DECL_ALIGNED(16) buf[4]; - _mm_store_si128((__m128i*)buf, val); - return buf[i]; - } unsigned get0() const { return (unsigned)_mm_cvtsi128_si32(val); @@ -1006,12 +984,6 @@ struct v_int32x4 { val = _mm_setr_epi32(v0, v1, v2, v3); } - int get(int i) const - { - int CV_DECL_ALIGNED(16) buf[4]; - _mm_store_si128((__m128i*)buf, val); - return buf[i]; - } int get0() const { return _mm_cvtsi128_si32(val); @@ -1026,12 +998,6 @@ struct v_float32x4 { val = _mm_setr_ps(v0, v1, v2, v3); } - float get(int i) const - { - float CV_DECL_ALIGNED(16) buf[4]; - _mm_store_ps(buf, val); - return buf[i]; - } float get0() const { return _mm_cvtss_f32(val); @@ -1046,12 +1012,6 @@ struct v_float64x2 { val = _mm_setr_pd(v0, v1); } - double get(int i) const - { - double CV_DECL_ALIGNED(16) buf[2]; - _mm_store_pd(buf, val); - return buf[i]; - } double get0() const { return _mm_cvtsd_f64(val); @@ -1376,10 +1336,10 @@ OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) -inline v_float32x4 v_sqrt(v_float32x4 x) +inline v_float32x4 v_sqrt(const v_float32x4& x) { return v_float32x4(_mm_sqrt_ps(x.val)); } -inline v_float32x4 v_invsqrt(v_float32x4 x) +inline v_float32x4 v_invsqrt(const v_float32x4& x) { static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); __m128 t = x.val; @@ -1389,18 +1349,18 @@ inline v_float32x4 v_invsqrt(v_float32x4 x) return v_float32x4(t); } -inline v_float64x2 v_sqrt(v_float64x2 x) +inline v_float64x2 v_sqrt(const v_float64x2& x) { return v_float64x2(_mm_sqrt_pd(x.val)); } -inline v_float64x2 v_invsqrt(v_float64x2 x) +inline v_float64x2 v_invsqrt(const v_float64x2& x) { static const __m128d v_1 = _mm_set1_pd(1.); return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); } -inline v_float32x4 v_abs(v_float32x4 x) +inline v_float32x4 v_abs(const 
v_float32x4& x) { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } -inline v_float64x2 v_abs(v_float64x2 x) +inline v_float64x2 v_abs(const v_float64x2& x) { return v_float64x2(_mm_and_pd(x.val, _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); From ce7590bd248a418eab6245ec6f9bf70b885904c6 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 08:32:06 +0300 Subject: [PATCH 06/12] continuing fixing universal intrinsics --- modules/hal/include/opencv2/hal/intrin.hpp | 38 +++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index 3c53d60797..e959a49cff 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -2218,7 +2218,7 @@ inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) @@ -2342,11 +2342,11 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, OPENCV_HAL_NOP, s8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, OPENCV_HAL_NOP, s16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, OPENCV_HAL_NOP, s32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) @@ -2381,18 +2381,18 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_ } // trade efficiency for convenience -#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, _Tp, suffix) \ +#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ inline _Tpvec operator << (const _Tpvec& a, int n) \ -{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)n))); } \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \ inline _Tpvec operator >> (const _Tpvec& a, int n) \ -{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)-n))); } +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, short, s16) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16) 
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mullo, vmulq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mullo, vmulq_s16) @@ -2444,8 +2444,8 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min) inline int v_signmask(const v_uint8x16& a) { - uint8x8_t m0 = vcreate_u8(CV_BIG_UINT(0x0706050403020100)); - uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_u8(m0, m0)); + int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); + uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0)); uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); } @@ -2454,8 +2454,8 @@ inline int v_signmask(const v_int8x16& a) inline int v_signmask(const v_uint16x8& a) { - uint16x4_t m0 = vcreate_u16(CV_BIG_UINT(0x0003000200010000)); - uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_u16(m0, m0)); + int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000)); + uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0)); uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); } @@ -2464,8 +2464,8 @@ inline int v_signmask(const v_int16x8& a) inline int v_signmask(const v_uint32x4& a) { - uint32x2_t m0 = vcreate_u32(CV_BIG_UINT(0x0000000100000000)); - uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_u32(m0, m0)); + int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000)); + uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0)); uint64x2_t v1 = vpaddlq_u32(v0); return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2); } From 17675b4cb83810dcd582cca0eb0d1cd7ddb545e8 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 10:51:49 +0300 Subject: [PATCH 07/12] trying to fix test failures --- modules/hal/include/opencv2/hal/intrin.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index e959a49cff..b067301d82 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -2287,16 +2287,16 @@ inline v_float32x4 operator ~ (const v_float32x4& a) inline v_float32x4 v_sqrt(const v_float32x4& x) { float32x4_t e = vrsqrteq_f32(x.val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); - return v_float32x4(vmulq_f32(e, x.val)); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + return v_float32x4(vmulq_f32(x.val, e)); } inline v_float32x4 v_invsqrt(const v_float32x4& x) { float32x4_t e = vrsqrteq_f32(x.val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); return v_float32x4(e); } From 92f2ad0a1128f5559708359efe01899586c89c33 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 11:32:20 +0300 Subject: [PATCH 08/12] trying to fix test failures (take 2) --- modules/hal/include/opencv2/hal/intrin.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index 
b067301d82..fc40cfda8d 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -47,6 +47,7 @@ #include #include +#include #include #define OPENCV_HAL_ADD(a, b) ((a) + (b)) @@ -2286,9 +2287,10 @@ inline v_float32x4 operator ~ (const v_float32x4& a) inline v_float32x4 v_sqrt(const v_float32x4& x) { - float32x4_t e = vrsqrteq_f32(x.val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN)); + float32x4_t e = vrsqrteq_f32(x1); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); return v_float32x4(vmulq_f32(x.val, e)); } From ca9a2887fc88c1f96b4380febab4d670d2ee9465 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 20 Apr 2015 20:27:03 +0300 Subject: [PATCH 09/12] extended universal intrinsics: added 64-bit integers, improved shift operators, renamed "pack" operations; changed semantics of extended integer multiplication, added v_dotprod. also, split intrinsics into separate files. C implementation is compiled only when none of SSE2 or NEON is available. --- modules/hal/include/opencv2/hal/intrin.hpp | 2634 +---------------- .../hal/include/opencv2/hal/intrin_cpp.hpp | 811 +++++ .../hal/include/opencv2/hal/intrin_neon.hpp | 823 +++++ .../hal/include/opencv2/hal/intrin_sse.hpp | 1544 ++++++++++ 4 files changed, 3230 insertions(+), 2582 deletions(-) create mode 100644 modules/hal/include/opencv2/hal/intrin_cpp.hpp create mode 100644 modules/hal/include/opencv2/hal/intrin_neon.hpp create mode 100644 modules/hal/include/opencv2/hal/intrin_sse.hpp diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index fc40cfda8d..439a04c74e 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -45,7 +45,6 @@ #ifndef __OPENCV_HAL_INTRIN_HPP__ #define __OPENCV_HAL_INTRIN_HPP__ -#include #include #include #include @@ -55,9 +54,12 @@ #define OPENCV_HAL_NOP(a) (a) #define OPENCV_HAL_1ST(a, b) (a) -namespace cv { namespace hal { +// unlike HAL API, which is in cv::hall, +// we put intrinsics into cv namespace to make its +// access from within opencv code more accessible +namespace cv { -template struct TypeTraits +template struct V_TypeTraits { typedef _Tp int_type; typedef _Tp uint_type; @@ -71,7 +73,7 @@ template struct TypeTraits static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; } }; -template<> struct TypeTraits +template<> struct V_TypeTraits { typedef uchar value_type; typedef schar int_type; @@ -88,7 +90,7 @@ template<> struct TypeTraits static value_type reinterpret_from_int(int_type x) { return (value_type)x; } }; -template<> struct TypeTraits +template<> struct V_TypeTraits { typedef schar value_type; typedef schar int_type; @@ -105,7 +107,7 @@ template<> struct TypeTraits static value_type reinterpret_from_int(int_type x) { return (value_type)x; } }; -template<> struct TypeTraits +template<> struct V_TypeTraits { typedef ushort value_type; typedef short int_type; @@ -123,7 +125,7 @@ template<> struct TypeTraits static value_type reinterpret_from_int(int_type x) { return (value_type)x; } }; -template<> struct TypeTraits +template<> struct V_TypeTraits { typedef short value_type; typedef short int_type; @@ -142,7 +144,7 @@ template<> struct TypeTraits static value_type reinterpret_from_int(int_type x) { return (value_type)x; } }; -template<> struct TypeTraits 
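The w_type members added to these V_TypeTraits specializations give generic code a wider accumulator type for each lane type, which is what the widening multiply and v_dotprod changes described in this patch rely on. A simplified stand-alone illustration of the idea (MyTraits and sum_wide are made-up names, not the real header):

#include <cstdint>
#include <cstdio>

template<typename T> struct MyTraits;                     // hypothetical traits, keyed on the lane type
template<> struct MyTraits<uint8_t>  { typedef uint16_t w_type; };
template<> struct MyTraits<uint16_t> { typedef uint32_t w_type; };
template<> struct MyTraits<uint32_t> { typedef uint64_t w_type; };

template<typename T, int n>
typename MyTraits<T>::w_type sum_wide(const T (&v)[n])
{
    typename MyTraits<T>::w_type s = 0;                   // accumulate in the wider type
    for (int i = 0; i < n; i++)
        s += v[i];
    return s;
}

int main()
{
    uint16_t v[4] = { 60000, 60000, 60000, 60000 };
    std::printf("%llu\n", (unsigned long long)sum_wide(v)); // 240000, no 16-bit wrap-around
    return 0;
}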
+template<> struct V_TypeTraits { typedef unsigned value_type; typedef int int_type; @@ -150,6 +152,7 @@ template<> struct TypeTraits typedef unsigned abs_type; typedef unsigned sum_type; + typedef uint64 w_type; typedef ushort nu_type; static int_type reinterpret_int(value_type x) { return (int_type)x; } @@ -157,7 +160,7 @@ template<> struct TypeTraits static value_type reinterpret_from_int(int_type x) { return (value_type)x; } }; -template<> struct TypeTraits +template<> struct V_TypeTraits { typedef int value_type; typedef int int_type; @@ -165,6 +168,7 @@ template<> struct TypeTraits typedef unsigned abs_type; typedef int sum_type; + typedef int64 w_type; typedef short n_type; typedef ushort nu_type; @@ -173,7 +177,38 @@ template<> struct TypeTraits static value_type reinterpret_from_int(int_type x) { return (value_type)x; } }; -template<> struct TypeTraits +template<> struct V_TypeTraits +{ + typedef uint64 value_type; + typedef int64 int_type; + typedef uint64 uint_type; + typedef uint64 abs_type; + typedef uint64 sum_type; + + typedef unsigned nu_type; + + static int_type reinterpret_int(value_type x) { return (int_type)x; } + static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } + static value_type reinterpret_from_int(int_type x) { return (value_type)x; } +}; + +template<> struct V_TypeTraits +{ + typedef int64 value_type; + typedef int64 int_type; + typedef uint64 uint_type; + typedef uint64 abs_type; + typedef int64 sum_type; + + typedef int nu_type; + + static int_type reinterpret_int(value_type x) { return (int_type)x; } + static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } + static value_type reinterpret_from_int(int_type x) { return (value_type)x; } +}; + + +template<> struct V_TypeTraits { typedef float value_type; typedef int int_type; @@ -203,7 +238,7 @@ template<> struct TypeTraits } }; -template<> struct TypeTraits +template<> struct V_TypeTraits { typedef double value_type; typedef int64 int_type; @@ -230,2587 +265,22 @@ template<> struct TypeTraits } }; -template struct v_reg -{ - typedef _Tp scalar_type; - typedef v_reg::int_type, n> int_vec; - typedef v_reg::abs_type, n> abs_vec; - enum { channels = n }; - - explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } - v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, - _Tp s4, _Tp s5, _Tp s6, _Tp s7) - { - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; - s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; - } - v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, - _Tp s4, _Tp s5, _Tp s6, _Tp s7, - _Tp s8, _Tp s9, _Tp s10, _Tp s11, - _Tp s12, _Tp s13, _Tp s14, _Tp s15) - { - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; - s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; - s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; - s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; - } - - v_reg() {} - v_reg(const v_reg<_Tp, n> & r) - { - for( int i = 0; i < n; i++ ) - s[i] = r.s[i]; - } - - _Tp get(const int i) const { return s[i]; } - _Tp get0() const { return s[0]; } - v_reg<_Tp, n> high() const - { - v_reg<_Tp, n> c; - int i; - for( i = 0; i < n/2; i++ ) - { - c.s[i] = s[i+(n/2)]; - c.s[i+(n/2)] = 0; - } - return c; - } - - static v_reg<_Tp, n> zero() - { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = (_Tp)0; - return c; - } - - static v_reg<_Tp, n> all(_Tp s) - { - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = s; - return c; - } - - template 
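The v_reg class being removed here (it reappears in intrin_cpp.hpp as the C fallback) models a SIMD register as a plain array of n lanes, with every operation written as a scalar loop, so the same universal-intrinsics code still compiles on targets without SSE2 or NEON. A stripped-down sketch of that pattern (SmallReg and its single operator are illustrative only):

#include <cstdio>

template<typename T, int n> struct SmallReg
{
    T s[n];                                                // the n lanes
    explicit SmallReg(const T* ptr) { for (int i = 0; i < n; i++) s[i] = ptr[i]; }
    SmallReg() {}
};

template<typename T, int n>
SmallReg<T, n> operator + (const SmallReg<T, n>& a, const SmallReg<T, n>& b)
{
    SmallReg<T, n> c;
    for (int i = 0; i < n; i++)                            // element-wise, no SIMD needed
        c.s[i] = (T)(a.s[i] + b.s[i]);
    return c;
}

int main()
{
    float x[4] = {1, 2, 3, 4}, y[4] = {10, 20, 30, 40};
    SmallReg<float, 4> c = SmallReg<float, 4>(x) + SmallReg<float, 4>(y);
    std::printf("%g %g %g %g\n", c.s[0], c.s[1], c.s[2], c.s[3]);
    return 0;
}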
static v_reg<_Tp2, n2> reinterpret_as(const v_reg<_Tp, n>& a) - { - size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); - v_reg<_Tp2, n2> c; - memcpy(&c.s[0], &a.s[0], bytes); - return c; - } - - _Tp s[n]; -}; - -#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \ -template inline v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return c; \ -} \ -template inline v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ -} - -OPENCV_HAL_IMPL_BIN_OP(+) -OPENCV_HAL_IMPL_BIN_OP(-) -OPENCV_HAL_IMPL_BIN_OP(*) -OPENCV_HAL_IMPL_BIN_OP(/) - -#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ -template inline v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - typedef typename TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)(TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return c; \ -} \ -template inline v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)(TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ -} - -OPENCV_HAL_IMPL_BIT_OP(&) -OPENCV_HAL_IMPL_BIT_OP(|) -OPENCV_HAL_IMPL_BIT_OP(^) - -template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = TypeTraits<_Tp>::reinterpret_from_int(~TypeTraits<_Tp>::reinterpret_int(a.s[i])); - return c; -} - -#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ -template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ -{ \ - v_reg<_Tp2, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i]); \ - return c; \ -} - -OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename TypeTraits<_Tp>::abs_type)std::abs, typename TypeTraits<_Tp>::abs_type) -OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) -OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) -OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) -OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) - -#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \ -template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cfunc(a.s[i], b.s[i]); \ - return c; \ -} \ -template inline _Tp hfunc(const v_reg<_Tp, n>& a) \ -{ \ - _Tp c = a.s[0]; \ - for( int i = 1; i < n; i++ ) \ - c = cfunc(c, a.s[i]); \ - return c; \ -} - -OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min) -OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max) - -template inline void v_minmax(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval) -{ - for( int i = 0; i < n; i++ ) - { - minval.s[i] = std::min(a.s[i], b.s[i]); - maxval.s[i] = std::max(a.s[i], b.s[i]); - } -} - - -#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ -template inline 
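The comparison macro that starts just above encodes each lane's result as -(int)(a cmp b) reinterpreted in the lane type: all bits set for true, all bits clear for false, matching what the SSE and NEON compare instructions produce. A scalar check of that encoding (the helper name is illustrative):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float mask_lane(float a, float b)        // one float lane of "a < b"
{
    int32_t m = -(int32_t)(a < b);              // 0xFFFFFFFF when true, 0x00000000 when false
    float f;
    std::memcpy(&f, &m, sizeof(f));             // reinterpret the bits as a float lane
    return f;
}

int main()
{
    float t = mask_lane(1.0f, 2.0f), f = mask_lane(2.0f, 1.0f);
    uint32_t bt, bf;
    std::memcpy(&bt, &t, 4); std::memcpy(&bf, &f, 4);
    std::printf("true lane: 0x%08x   false lane: 0x%08x\n", bt, bf);
    return 0;
}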
v_reg<_Tp, n> \ - operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename TypeTraits<_Tp>::int_type itype; \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ - return c; \ -} - -OPENCV_HAL_IMPL_CMP_OP(<) -OPENCV_HAL_IMPL_CMP_OP(>) -OPENCV_HAL_IMPL_CMP_OP(<=) -OPENCV_HAL_IMPL_CMP_OP(>=) -OPENCV_HAL_IMPL_CMP_OP(==) -OPENCV_HAL_IMPL_CMP_OP(!=) - -#define OPENCV_HAL_IMPL_ADDSUB_OP(func, bin_op, cast_op, _Tp2) \ -template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef _Tp2 rtype; \ - v_reg c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ - return c; \ -} - -OPENCV_HAL_IMPL_ADDSUB_OP(v_add_wrap, +, (_Tp), _Tp) -OPENCV_HAL_IMPL_ADDSUB_OP(v_sub_wrap, -, (_Tp), _Tp) -OPENCV_HAL_IMPL_ADDSUB_OP(v_absdiff, -, (rtype)std::abs, typename TypeTraits<_Tp>::abs_type) - -template inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = 1.f/std::sqrt(a.s[i]); - return c; -} - -template inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); - return c; -} - - -template inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; - return c; -} - -template inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) -{ - v_reg<_Tp, n> d; - for( int i = 0; i < n; i++ ) - d.s[i] = a.s[i]*b.s[i] + c.s[i]; - return d; -} - -template inline v_reg<_Tp, n> v_mullo(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = (_Tp)(a.s[i]*b.s[i]); - return c; -} - -template inline v_reg<_Tp, n> v_mulhi2(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = (_Tp)((a.s[i]*b.s[i]*2 + TypeTraits<_Tp>::delta) >> TypeTraits<_Tp>::shift); - return c; -} - -#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ -template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ -{ \ - v_reg<_Tp, n> c; \ - for( int i = 0; i < n; i++ ) \ - c.s[i] = (_Tp)(a.s[i] shift_op imm); \ - return c; \ -} - -OPENCV_HAL_IMPL_SHIFT_OP(<<) -OPENCV_HAL_IMPL_SHIFT_OP(>>) - -template inline typename TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) -{ - typename TypeTraits<_Tp>::sum_type c = a.s[0]; - for( int i = 1; i < n; i++ ) - c += a.s[i]; - return c; -} - -template inline int v_signmask(const v_reg<_Tp, n>& a) -{ - int mask = 0; - for( int i = 0; i < n; i++ ) - mask |= (TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; - return mask; -} - -template inline bool v_check_all(const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - if( TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) - return false; - return true; -} - -template inline bool v_check_any(const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - if( TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) - return true; - return false; -} - -template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, - const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? 
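v_signmask above packs the sign bit of every lane into one integer, lane i going to bit i; the SSE branch later implements the same contract with _mm_movemask_*. A scalar reference for four int32 lanes (signmask4 is an illustrative name):

#include <cstdint>
#include <cstdio>

static int signmask4(const int32_t v[4])
{
    int mask = 0;
    for (int i = 0; i < 4; i++)
        mask |= (int)((uint32_t)v[i] >> 31) << i;   // sign bit of lane i -> bit i
    return mask;
}

int main()
{
    int32_t v[4] = { -1, 2, -3, 4 };
    std::printf("0x%x\n", signmask4(v));            // prints 0x5: lanes 0 and 2 are negative
    return 0;
}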
b.s[i] : a.s[i]; - return c; -} - -template inline void v_expand(const v_reg<_Tp, n>& a, - v_reg::w_type, n/2>& b0, - v_reg::w_type, n/2>& b1) -{ - for( int i = 0; i < (n/2); i++ ) - { - b0.s[i] = a.s[i]; - b1.s[i] = a.s[i+(n/2)]; - } -} - -template inline v_reg::int_type, n> - v_reinterpret_as_int(const v_reg<_Tp, n>& a) -{ - v_reg::int_type, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = TypeTraits<_Tp>::reinterpret_int(a.s[i]); - return c; -} - -template inline v_reg::uint_type, n> - v_reinterpret_as_uint(const v_reg<_Tp, n>& a) -{ - v_reg::uint_type, n> c; - for( int i = 0; i < n; i++ ) - c.s[i] = TypeTraits<_Tp>::reinterpret_uint(a.s[i]); - return c; -} - -template inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, - v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) -{ - int i; - for( i = 0; i < n/2; i++ ) - { - b0.s[i*2] = a0.s[i]; - b0.s[i*2+1] = a1.s[i]; - } - for( ; i < n; i++ ) - { - b1.s[i*2-n] = a0.s[i]; - b1.s[i*2-n+1] = a1.s[i]; - } -} - -template inline v_reg<_Tp, n> v_load(const _Tp* ptr) -{ - return v_reg<_Tp, n>(ptr); -} - -template inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr) -{ - return v_reg<_Tp, n>(ptr); -} - -template inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < n/2; i++ ) - { - c.s[i] = loptr[i]; - c.s[i+n/2] = hiptr[i]; - } - return c; -} - -template inline v_reg::w_type, n> v_load_expand(const _Tp* ptr) -{ - typedef typename TypeTraits<_Tp>::w_type w_type; - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} - -template inline v_reg::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr) -{ - typedef typename TypeTraits::w_type>::w_type w_type; - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = ptr[i]; - } - return c; -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) -{ - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - a.s[i] = ptr[i3]; - b.s[i] = ptr[i3+1]; - c.s[i] = ptr[i3+2]; - } -} - -template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, - v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, - v_reg<_Tp, n>& d) -{ - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - a.s[i] = ptr[i4]; - b.s[i] = ptr[i4+1]; - c.s[i] = ptr[i4+2]; - d.s[i] = ptr[i4+3]; - } -} - -template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) -{ - int i, i3; - for( i = i3 = 0; i < n; i++, i3 += 3 ) - { - ptr[i3] = a.s[i]; - ptr[i3+1] = b.s[i]; - ptr[i3+2] = c.s[i]; - } -} - -template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - const v_reg<_Tp, n>& d) -{ - int i, i4; - for( i = i4 = 0; i < n; i++, i4 += 4 ) - { - ptr[i4] = a.s[i]; - ptr[i4+1] = b.s[i]; - ptr[i4+2] = c.s[i]; - ptr[i4+3] = d.s[i]; - } -} - -template inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i]; -} - -template inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < (n/2); i++ ) - ptr[i] = a.s[i+(n/2)]; -} - -template inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - ptr[i] = a.s[i]; -} - -template inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < 
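v_load_deinterleave above splits packed multi-channel data such as RGBRGB... into one register per channel, and v_store_interleave packs it back. A scalar reference for the 3-channel case, assuming 8-bit elements (deinterleave3 is an illustrative name):

#include <cstdio>

static void deinterleave3(const unsigned char* ptr, int n,
                          unsigned char* a, unsigned char* b, unsigned char* c)
{
    for (int i = 0; i < n; i++)        // element i comes from the packed triple at 3*i
    {
        a[i] = ptr[i*3];
        b[i] = ptr[i*3 + 1];
        c[i] = ptr[i*3 + 2];
    }
}

int main()
{
    unsigned char rgb[] = { 10,20,30, 11,21,31, 12,22,32, 13,23,33 };
    unsigned char r[4], g[4], b[4];
    deinterleave3(rgb, 4, r, g, b);
    std::printf("R: %d %d %d %d\n", r[0], r[1], r[2], r[3]);
    return 0;
}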
(n/2); i++ ) - { - c.s[i] = a.s[i]; - c.s[i+(n/2)] = b.s[i]; - } -} - -template inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) -{ - v_reg<_Tp, n> c; - for( int i = 0; i < (n/2); i++ ) - { - c.s[i] = a.s[i+(n/2)]; - c.s[i+(n/2)] = b.s[i+(n/2)]; - } -} - -template inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) -{ - for( int i = 0; i < (n/2); i++ ) - { - low.s[i] = a.s[i]; - low.s[i+(n/2)] = b.s[i]; - high.s[i] = a.s[i+(n/2)]; - high.s[i+(n/2)] = b.s[i+(n/2)]; - } -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvRound(a.s[i]); - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvFloor(a.s[i]); - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = cvCeil(a.s[i]); - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (int)(a.s[i]); - return c; -} - -template inline v_reg v_round(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvRound(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_floor(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvFloor(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_ceil(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvCeil(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_trunc(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = cvCeil(a.s[i]); - c.s[i+n] = 0; - } - return c; -} - -template inline v_reg v_cvt_f32(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (float)a.s[i]; - return c; -} - -template inline v_reg v_cvt_f64(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; -} - -template inline v_reg v_cvt_f64(const v_reg& a) -{ - v_reg c; - for( int i = 0; i < n; i++ ) - c.s[i] = (double)a.s[i]; - return c; } -template inline v_reg<_Tp2, n*2> v_cvtsat(const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b) -{ - v_reg<_Tp2, n*2> c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = saturate_cast<_Tp2>(a.s[i]); - c.s[i+n] = saturate_cast<_Tp2>(b.s[i]); - } - return c; -} +#if CV_SSE2 -template inline v_reg<_Tp2, n*2> v_cvtsat(const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, - int rshift) -{ - v_reg<_Tp2, n*2> c; - for( int i = 0; i < n; i++ ) - { - c.s[i] = saturate_cast<_Tp2>((a.s[i] + (1<<(rshift-1))) >> rshift); - c.s[i+n] = saturate_cast<_Tp2>((b.s[i] + (1<<(rshift-1))) >> rshift); - } - return c; -} +#include "opencv2/hal/intrin_sse.hpp" -template inline void v_storesat(_Tp2* ptr, const v_reg<_Tp, n>& a) -{ - for( int i = 0; i < n; i++ ) - { - ptr[i] = saturate_cast<_Tp2>(a.s[i]); - } -} +#elif CV_NEON -template inline void v_storesat(_Tp2* ptr, const v_reg<_Tp, n>& a, int rshift) -{ - for( int i = 0; i < n; i++ ) - { - ptr[i] = saturate_cast<_Tp2>((a.s[i] + (1<<(rshift-1))) >> rshift); - } -} +#include "opencv2/hal/intrin_neon.hpp" -template inline void v_transpose4x4(const v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, - const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, - v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, - v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3) -{ - b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); - b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); - b2 = 
v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); - b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); -} +#else -#if CV_SSE2 - -#define CV_SIMD128 1 -#define CV_SIMD128_64F 1 - -struct v_uint8x16 -{ - explicit v_uint8x16(__m128i v) : val(v) {} - v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, - uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) - { - val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, - (char)v4, (char)v5, (char)v6, (char)v7, - (char)v8, (char)v9, (char)v10, (char)v11, - (char)v12, (char)v13, (char)v14, (char)v15); - } - uchar get0() const - { - return (uchar)_mm_cvtsi128_si32(val); - } - - __m128i val; -}; - -struct v_int8x16 -{ - explicit v_int8x16(__m128i v) : val(v) {} - v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, - schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) - { - val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, - (char)v4, (char)v5, (char)v6, (char)v7, - (char)v8, (char)v9, (char)v10, (char)v11, - (char)v12, (char)v13, (char)v14, (char)v15); - } - schar get0() const - { - return (schar)_mm_cvtsi128_si32(val); - } - - __m128i val; -}; - -struct v_uint16x8 -{ - explicit v_uint16x8(__m128i v) : val(v) {} - v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) - { - val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, - (short)v4, (short)v5, (short)v6, (short)v7); - } - ushort get0() const - { - return (ushort)_mm_cvtsi128_si32(val); - } - - __m128i val; -}; - -struct v_int16x8 -{ - explicit v_int16x8(__m128i v) : val(v) {} - v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) - { - val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, - (short)v4, (short)v5, (short)v6, (short)v7); - } - short get0() const - { - return (short)_mm_cvtsi128_si32(val); - } - __m128i val; -}; - -struct v_uint32x4 -{ - explicit v_uint32x4(__m128i v) : val(v) {} - v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) - { - val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); - } - unsigned get0() const - { - return (unsigned)_mm_cvtsi128_si32(val); - } - __m128i val; -}; - -struct v_int32x4 -{ - explicit v_int32x4(__m128i v) : val(v) {} - v_int32x4(int v0, int v1, int v2, int v3) - { - val = _mm_setr_epi32(v0, v1, v2, v3); - } - int get0() const - { - return _mm_cvtsi128_si32(val); - } - __m128i val; -}; - -struct v_float32x4 -{ - explicit v_float32x4(__m128 v) : val(v) {} - v_float32x4(float v0, float v1, float v2, float v3) - { - val = _mm_setr_ps(v0, v1, v2, v3); - } - float get0() const - { - return _mm_cvtss_f32(val); - } - __m128 val; -}; - -struct v_float64x2 -{ - explicit v_float64x2(__m128d v) : val(v) {} - v_float64x2(double v0, double v1) - { - val = _mm_setr_pd(v0, v1); - } - double get0() const - { - return _mm_cvtsd_f64(val); - } - __m128d val; -}; - -inline v_uint8x16 v_setzero_u8() { return v_uint8x16(_mm_setzero_si128()); } -inline v_int8x16 v_setzero_s8() { return v_int8x16(_mm_setzero_si128()); } -inline v_uint16x8 v_setzero_u16() { return v_uint16x8(_mm_setzero_si128()); } -inline v_int16x8 v_setzero_s16() { return v_int16x8(_mm_setzero_si128()); } -inline v_uint32x4 v_setzero_u32() { return v_uint32x4(_mm_setzero_si128()); } -inline v_int32x4 v_setzero_s32() { return v_int32x4(_mm_setzero_si128()); } -inline v_float32x4 v_setzero_f32() { return 
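Each SSE vector type here is a thin struct around __m128i (or __m128/__m128d) plus constructors and a get0() accessor; the operator macros further down then map arithmetic onto intrinsics such as the saturating _mm_adds_epu8. A reduced sketch of that wrapper pattern (U8x16 and setall are illustrative names; building it requires an SSE2 target):

#include <emmintrin.h>
#include <cstdio>

struct U8x16
{
    __m128i val;
    explicit U8x16(__m128i v) : val(v) {}
    unsigned char get0() const { return (unsigned char)_mm_cvtsi128_si32(val); }  // lane 0
};

static U8x16 setall(unsigned char v) { return U8x16(_mm_set1_epi8((char)v)); }

static U8x16 operator + (const U8x16& a, const U8x16& b)
{
    return U8x16(_mm_adds_epu8(a.val, b.val));   // saturating add: 200 + 100 -> 255
}

int main()
{
    U8x16 c = setall(200) + setall(100);
    std::printf("%d\n", (int)c.get0());          // prints 255, not 44
    return 0;
}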
v_float32x4(_mm_setzero_ps()); } -inline v_float64x2 v_setzero_f64() { return v_float64x2(_mm_setzero_pd()); } - -inline v_uint8x16 v_setall_u8(uchar v) { return v_uint8x16(_mm_set1_epi8((char)v)); } -inline v_int8x16 v_setall_s8(schar v) { return v_int8x16(_mm_set1_epi8((char)v)); } -inline v_uint16x8 v_setall_u16(ushort v) { return v_uint16x8(_mm_set1_epi16((short)v)); } -inline v_int16x8 v_setall_s16(short v) { return v_int16x8(_mm_set1_epi16((short)v)); } -inline v_uint32x4 v_setall_u32(unsigned v) { return v_uint32x4(_mm_set1_epi32((int)v)); } -inline v_int32x4 v_setall_s32(int v) { return v_int32x4(_mm_set1_epi32(v)); } -inline v_float32x4 v_setall_f32(float v) { return v_float32x4(_mm_set1_ps(v)); } -inline v_float64x2 v_setall_f64(double v) { return v_float64x2(_mm_set1_pd(v)); } - -template inline v_uint8x16 v_reinterpret_as_u8(const _Tpvec& a) -{ return v_uint8x16(a.val); } - -inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& a) -{ return v_uint8x16(_mm_castps_si128(a.val)); } - -inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& a) -{ return v_uint8x16(_mm_castpd_si128(a.val)); } - -template inline v_int8x16 v_reinterpret_as_s8(const _Tpvec& a) -{ return v_int8x16(a.val); } - -inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& a) -{ return v_int8x16(_mm_castps_si128(a.val)); } - -inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& a) -{ return v_int8x16(_mm_castpd_si128(a.val)); } - -template inline v_uint16x8 v_reinterpret_as_u16(const _Tpvec& a) -{ return v_uint16x8(a.val); } - -inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& a) -{ return v_uint16x8(_mm_castps_si128(a.val)); } - -inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& a) -{ return v_uint16x8(_mm_castpd_si128(a.val)); } - -template inline v_int16x8 v_reinterpret_as_s16(const _Tpvec& a) -{ return v_int16x8(a.val); } - -inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& a) -{ return v_int16x8(_mm_castps_si128(a.val)); } - -inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& a) -{ return v_int16x8(_mm_castpd_si128(a.val)); } - -template inline v_uint32x4 v_reinterpret_as_u32(const _Tpvec& a) -{ return v_uint32x4(a.val); } - -inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& a) -{ return v_uint32x4(_mm_castps_si128(a.val)); } - -inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& a) -{ return v_uint32x4(_mm_castpd_si128(a.val)); } - -template inline v_int32x4 v_reinterpret_as_s32(const _Tpvec& a) -{ return v_int32x4(a.val); } - -inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& a) -{ return v_int32x4(_mm_castps_si128(a.val)); } - -inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& a) -{ return v_int32x4(_mm_castpd_si128(a.val)); } - -template inline v_float32x4 v_reinterpret_as_f32(const _Tpvec& a) -{ return v_float32x4(_mm_castsi128_ps(a.val)); } - -inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) -{ return v_float32x4(_mm_castpd_ps(a.val)); } - -template inline v_float64x2 v_reinterpret_as_f64(const _Tpvec& a) -{ return v_float64x2(_mm_castsi128_pd(a.val)); } - -inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) -{ return v_float64x2(_mm_castps_pd(a.val)); } - -inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) -{ - __m128i delta = _mm_set1_epi16(255); - return v_uint8x16(_mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), - _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta))); -} -inline v_uint8x16 v_shiftn_u16(const v_uint16x8& a, const v_uint16x8& b, int n) -{ - // we assume 
that n > 0, and so the shifted 16-bit values can be treated as signed numbers. - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), - _mm_srli_epi16(_mm_add_epi16(b.val, delta), n))); -} - -inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) -{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } -inline v_uint8x16 v_shiftun_s16(const v_int16x8& a, const v_int16x8& b, int n) -{ - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), - _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); -} - -inline void v_storen_u16(uchar* ptr, const v_uint16x8& a) -{ - __m128i delta = _mm_set1_epi16(255); - _mm_storel_epi64((__m128i*)ptr, - _mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta)); -} - -inline void v_shiftstoren_u16(uchar* ptr, const v_uint16x8& a, int n) -{ - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - _mm_storel_epi64((__m128i*)ptr, - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta)); -} - -inline void v_storeun_s16(uchar* ptr, const v_int16x8& a) -{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } - -inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& a, int n) -{ - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - _mm_storel_epi64((__m128i*)ptr, - _mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta)); -} - -inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) -{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); } - -inline v_int8x16 v_shiftn_s16(const v_int16x8& a, const v_int16x8& b, int n) -{ - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), - _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); -} - -inline void v_storen_s16(schar* ptr, const v_int16x8& a) -{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } - -inline void v_shiftstoren_s16(schar* ptr, const v_int16x8& a, int n) -{ - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - _mm_storel_epi64((__m128i*)ptr, - _mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta)); -} - -// bit-wise "mask ? 
a : b" -inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) -{ - return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); -} - -inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); - __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); - __m128i r = _mm_packs_epi32(a1, b1); - return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); -} -inline v_uint16x8 v_shiftn_u32(const v_uint32x4& a, const v_uint32x4& b, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); - __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); - return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); -} -inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) -{ - __m128i delta32 = _mm_set1_epi32(32768); - __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); - return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); -} -inline v_uint16x8 v_shiftun_s32(const v_int32x4& a, const v_int32x4& b, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); - __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); - return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); -} - -inline void v_storen_u32(ushort* ptr, const v_uint32x4& a) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); - __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); - _mm_storel_epi64((__m128i*)ptr, r); -} -inline void v_shiftstoren_u32(ushort* ptr, const v_uint32x4& a, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); - __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); - _mm_storel_epi64((__m128i*)ptr, r); -} -inline void v_storeun_s32(ushort* ptr, const v_int32x4& a) -{ - __m128i delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(a.val, delta32); - __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); - _mm_storel_epi64((__m128i*)ptr, r); -} -inline void v_shiftstoreun_s32(ushort* ptr, const v_int32x4& a, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); - __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); - _mm_storel_epi64((__m128i*)ptr, r); -} - -inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) -{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); } -inline v_int16x8 v_shiftn_s32(const v_int32x4& a, const v_int32x4& b, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)); - return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), - _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); -} - -inline 
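v_select_si128 above relies on the branchless identity (mask ? a : b) == b ^ ((a ^ b) & mask), valid when each mask lane is either all ones or all zeros; the saturating 32-to-16-bit packs around it use it to substitute the clamp value wherever a lane is out of range. A scalar check of the identity (select_bits is an illustrative name):

#include <cstdint>
#include <cstdio>

static uint32_t select_bits(uint32_t mask, uint32_t a, uint32_t b)
{
    return b ^ ((a ^ b) & mask);                    // all-ones mask -> a, zero mask -> b
}

int main()
{
    std::printf("%u %u\n",
                select_bits(0xFFFFFFFFu, 111u, 222u),   // 111
                select_bits(0x00000000u, 111u, 222u));  // 222
    return 0;
}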
void v_storen_s32(short* ptr, const v_int32x4& a) -{ - _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); -} -inline void v_shiftstoren_s32(short* ptr, const v_int32x4& a, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)); - __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); - _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); -} - -inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) -{ - __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); - __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val); - __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val); - __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val); - - return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3))); -} - - -#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { \ - return _Tpvec(intrin(a.val, b.val)); \ - } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { \ - a.val = intrin(a.val, b.val); \ - return a; \ - } - -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) - -inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i c0 = _mm_mul_epu32(a.val, b.val); - __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); - __m128i d0 = _mm_unpacklo_epi32(c0, c1); - __m128i d1 = _mm_unpackhi_epi32(c0, c1); - return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); -} -inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) -{ - __m128i c0 = _mm_mul_epu32(a.val, b.val); - __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); - __m128i d0 = _mm_unpacklo_epi32(c0, c1); - __m128i d1 = _mm_unpackhi_epi32(c0, c1); - return v_int32x4(_mm_unpacklo_epi64(d0, d1)); -} - -#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ - { \ - return _Tpvec(_mm_xor_##suffix(a.val, 
not_const)); \ - } - -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1)) -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1)) -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1)) -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1)) -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) -OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) - -inline v_float32x4 v_sqrt(const v_float32x4& x) -{ return v_float32x4(_mm_sqrt_ps(x.val)); } - -inline v_float32x4 v_invsqrt(const v_float32x4& x) -{ - static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); - __m128 t = x.val; - __m128 h = _mm_mul_ps(t, _0_5); - t = _mm_rsqrt_ps(t); - t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); - return v_float32x4(t); -} - -inline v_float64x2 v_sqrt(const v_float64x2& x) -{ return v_float64x2(_mm_sqrt_pd(x.val)); } - -inline v_float64x2 v_invsqrt(const v_float64x2& x) -{ - static const __m128d v_1 = _mm_set1_pd(1.); - return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); -} - -inline v_float32x4 v_abs(const v_float32x4& x) -{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } -inline v_float64x2 v_abs(const v_float64x2& x) -{ - return v_float64x2(_mm_and_pd(x.val, - _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); -} - -// TODO: exp, log, sin, cos - -#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \ -inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(intrin(a.val, b.val)); \ -} - -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) - -inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) -{ - __m128i delta = _mm_set1_epi8((char)-128); - return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), - _mm_xor_si128(b.val, delta)))); -} -inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) -{ - __m128i delta = _mm_set1_epi8((char)-128); - return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), - _mm_xor_si128(b.val, delta)))); -} -inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b) -{ - return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val))); -} -inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b) -{ - return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val)); -} -inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i delta = _mm_set1_epi32((int)0x80000000); - __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); - return v_uint32x4(v_select_si128(mask, b.val, a.val)); -} -inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i delta = _mm_set1_epi32((int)0x80000000); - __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); - return v_uint32x4(v_select_si128(mask, a.val, 
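SSE2 has no signed 8-bit min/max and no unsigned 32-bit compare, so the v_min/v_max overloads above flip the sign bit (xor with 0x80 or 0x80000000) to move the values into a range where the available instruction orders them correctly, then flip it back. A scalar check of that bias trick, building a signed 8-bit min from an unsigned min (min_s8_via_u8 is an illustrative name):

#include <algorithm>
#include <cstdint>
#include <cstdio>

static int8_t min_s8_via_u8(int8_t a, int8_t b)
{
    uint8_t ua = (uint8_t)((uint8_t)a ^ 0x80);      // maps [-128,127] onto [0,255], order preserved
    uint8_t ub = (uint8_t)((uint8_t)b ^ 0x80);
    return (int8_t)(std::min(ua, ub) ^ 0x80);       // undo the bias afterwards
}

int main()
{
    std::printf("%d %d\n",
                (int)min_s8_via_u8(-5, 3),          // -5
                (int)min_s8_via_u8(100, -100));     // -100
    return 0;
}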
b.val)); -} -inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b) -{ - return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val)); -} -inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) -{ - return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val)); -} - -#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ -inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ -{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - __m128i not_mask = _mm_set1_epi32(-1); \ - return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ -} \ -inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ -{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - __m128i not_mask = _mm_set1_epi32(-1); \ - return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ -} \ -inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - __m128i smask = _mm_set1_##suffix(sbit); \ - return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ -} \ -inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - __m128i smask = _mm_set1_##suffix(sbit); \ - return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ -} \ -inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - __m128i smask = _mm_set1_##suffix(sbit); \ - __m128i not_mask = _mm_set1_epi32(-1); \ - __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ - return _Tpuvec(_mm_xor_si128(res, not_mask)); \ -} \ -inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - __m128i smask = _mm_set1_##suffix(sbit); \ - __m128i not_mask = _mm_set1_epi32(-1); \ - __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ - return _Tpuvec(_mm_xor_si128(res, not_mask)); \ -} \ -inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ -} \ -inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ -} \ -inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - __m128i not_mask = _mm_set1_epi32(-1); \ - return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ -} \ -inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - __m128i not_mask = _mm_set1_epi32(-1); \ - return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ -} - -OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) -OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) -OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) - -#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator <= 
(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } - -OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) -OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) - -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) -OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) - -#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ -inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ -} \ -inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - __m128i smask = _mm_set1_epi32(smask32); \ - __m128i a1 = _mm_xor_si128(a.val, smask); \ - __m128i b1 = _mm_xor_si128(b.val, smask); \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ -} - -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) - -#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ -inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ -{ \ - _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \ - return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \ -} \ -inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ -{ \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(_mm_sqrt_##suffix(res)); \ -} \ -inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ -{ \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(res); \ -} \ -inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ -{ \ - return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ -} - -OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) -OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) - -#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ -{ \ - return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ -} \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ -{ \ - return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ -} \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ -{ \ - return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ -} \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ -{ \ - return _Tpsvec(_mm_srai_##suffix(a.val, imm)); \ -} - -OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16) -OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32) - -inline v_int16x8 v_mullo(const v_int16x8& a, const v_int16x8& b) -{ - return v_int16x8(_mm_mullo_epi16(a.val, b.val)); -} -inline v_uint16x8 v_mullo(const v_uint16x8& a, const v_uint16x8& b) -{ - return 
v_uint16x8(_mm_mullo_epi16(a.val, b.val)); -} -inline v_int16x8 v_mulhi2(const v_int16x8& a, const v_int16x8& b) -{ - return v_int16x8(_mm_slli_epi16(_mm_mulhi_epi16(a.val, b.val), 1)); -} -inline v_uint16x8 v_mulhi2(const v_uint16x8& a, const v_uint16x8& b) -{ - return v_uint16x8(_mm_slli_epi16(_mm_mulhi_epu16(a.val, b.val), 1)); -} - -#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ -inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \ -inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \ -inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ -{ \ - return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ - _mm_loadl_epi64((const __m128i*)ptr1))); \ -} \ -inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ _mm_storeu_si128((__m128i*)ptr, a.val); } \ -inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ _mm_store_si128((__m128i*)ptr, a.val); } \ -inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ _mm_storel_epi64((__m128i*)ptr, a.val); } \ -inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ -{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); } - -OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar) -OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar) -OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort) -OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short) -OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int) - -#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \ -inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(_mm_load_##suffix(ptr)); } \ -inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ -{ \ - return _Tpvec(_mm_castsi128_##suffix( \ - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ - _mm_loadl_epi64((const __m128i*)ptr1)))); \ -} \ -inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ _mm_storeu_##suffix(ptr, a.val); } \ -inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ _mm_store_##suffix(ptr, a.val); } \ -inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ -inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ -{ \ - __m128i a1 = _mm_cast##suffix##_si128(a.val); \ - _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \ -} - -OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) -OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) - -#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ -inline scalartype v_reduce_##func(const _Tpvec& a) \ -{ \ - scalartype CV_DECL_ALIGNED(16) buf[4]; \ - v_store_aligned(buf, a); \ - scalartype s0 = scalar_func(buf[0], buf[1]); \ - scalartype s1 = scalar_func(buf[2], buf[3]); \ - return scalar_func(s0, s1); \ -} - -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, 
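The reduce macro whose instantiations are interrupted here implements horizontal sum/min/max by storing the four lanes into an aligned buffer and combining them pairwise in scalar code, which on plain SSE2 is usually no worse than a chain of shuffles. A standalone sketch of the same approach (reduce_sum_ps is an illustrative name; assumes an SSE2 target):

#include <emmintrin.h>
#include <cstdio>

static float reduce_sum_ps(__m128 a)
{
    alignas(16) float buf[4];
    _mm_store_ps(buf, a);                     // spill the four lanes to memory
    return (buf[0] + buf[1]) + (buf[2] + buf[3]);
}

int main()
{
    __m128 v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
    std::printf("%g\n", reduce_sum_ps(v));    // prints 10
    return 0;
}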
float, max, std::max) -OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) - -#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ -inline int v_signmask(const _Tpvec& a) \ -{ \ - return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ -} \ -inline bool v_check_all(const _Tpvec& a) \ -{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \ -inline bool v_check_any(const _Tpvec& a) \ -{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; } - -#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) -inline __m128i v_packq_epi32(__m128i a) -{ - __m128i b = _mm_packs_epi32(a, a); - return _mm_packs_epi16(b, b); -} - -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) - -#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \ -inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \ -} - -OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) -OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) - -#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ -inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ -{ \ - __m128i z = _mm_setzero_si128(); \ - b0.val = _mm_unpacklo_##suffix(a.val, z); \ - b1.val = _mm_unpackhi_##suffix(a.val, z); \ -} \ -inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ -{ \ - __m128i z = _mm_setzero_si128(); \ - return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ -} \ -inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ -{ \ - b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ - b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ -} \ -inline _Tpwsvec v_load_expand(const _Tps* ptr) \ -{ \ - __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ - return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ -} - -OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) -OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) - -inline v_uint32x4 v_load_expand_q(const uchar* ptr) -{ - __m128i z = _mm_setzero_si128(); - __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); - return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); -} - -inline v_int32x4 v_load_expand_q(const schar* ptr) -{ - __m128i a = _mm_cvtsi32_si128(*(const 
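The signed halves of the expand macro above widen lanes without SSE4.1's pmovsx by unpacking a register with itself, so each byte occupies the high half of a 16-bit lane, and then shifting right arithmetically to replicate the sign bit. A sketch of that trick for the low eight int8 lanes (expand_lo_s8_to_s16 is an illustrative name; assumes an SSE2 target):

#include <emmintrin.h>
#include <cstdio>

static __m128i expand_lo_s8_to_s16(__m128i a)
{
    // bytes become a0 a0 a1 a1 ... a7 a7; an arithmetic >> 8 of each 16-bit
    // lane then leaves the sign-extended values a0..a7
    return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

int main()
{
    __m128i a = _mm_setr_epi8(-1, 2, -3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
    alignas(16) short out[8];
    _mm_store_si128((__m128i*)out, expand_lo_s8_to_s16(a));
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // -1 2 -3 4
    return 0;
}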
int*)ptr); - a = _mm_unpacklo_epi8(a, a); - a = _mm_unpacklo_epi8(a, a); - return v_int32x4(_mm_srai_epi32(a, 24)); -} - -#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ -inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ -{ \ - b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \ - b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \ -} \ -inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ -{ \ - __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ - return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \ -} \ -inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ -{ \ - __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ - return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \ -} \ -inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ -{ \ - __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ - c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \ - d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \ -} - -OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) -OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) - -inline v_int32x4 v_round(const v_float32x4& a) -{ return v_int32x4(_mm_cvtps_epi32(a.val)); } - -inline v_int32x4 v_floor(const v_float32x4& a) -{ - __m128i a1 = _mm_cvtps_epi32(a.val); - __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); - return v_int32x4(_mm_add_epi32(a1, mask)); -} - -inline v_int32x4 v_ceil(const v_float32x4& a) -{ - __m128i a1 = _mm_cvtps_epi32(a.val); - __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); - return v_int32x4(_mm_sub_epi32(a1, mask)); -} - -inline v_int32x4 v_trunc(const v_float32x4& a) -{ return v_int32x4(_mm_cvttps_epi32(a.val)); } - -inline v_int32x4 v_round(const v_float64x2& a) -{ return v_int32x4(_mm_cvtpd_epi32(a.val)); } - -inline v_int32x4 v_floor(const v_float64x2& a) -{ - __m128i a1 = _mm_cvtpd_epi32(a.val); - __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); - mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 - return v_int32x4(_mm_add_epi32(a1, mask)); -} - -inline v_int32x4 v_ceil(const v_float64x2& a) -{ - __m128i a1 = _mm_cvtpd_epi32(a.val); - __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); - mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 - return v_int32x4(_mm_sub_epi32(a1, mask)); -} - -inline v_int32x4 v_trunc(const v_float64x2& a) -{ return v_int32x4(_mm_cvttpd_epi32(a.val)); } - -#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ -inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ - const _Tpvec& a2, const _Tpvec& a3, \ - _Tpvec& b0, _Tpvec& b1, \ - _Tpvec& b2, _Tpvec& b3) \ -{ \ - __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ - __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ - __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ - __m128i t3 = 
cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \ -\ - b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ - b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ - b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \ - b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \ -} - -OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) -OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) - -#if 0 -inline void v_load_deinterleave(const uchar*, v_uint8x16&, v_uint8x16&, v_uint8x16&) -{ - // !!! TODO !!! -} -#endif - -inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) -{ - __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... - __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... - __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... - __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... - - __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... - __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... - __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... - __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ... - - u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... - u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... - u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... - u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... - - v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... - v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... - v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ... - v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ... - - a.val = _mm_unpacklo_epi8(v0, v1); - b.val = _mm_unpackhi_epi8(v0, v1); - c.val = _mm_unpacklo_epi8(v2, v3); - d.val = _mm_unpackhi_epi8(v2, v3); -} - -#if 0 -inline void v_load_deinterleave(const ushort*, v_uint16x8&, v_uint16x8&, v_uint16x8&) -{ - // !!! TODO !!! -} -#endif - -inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) -{ - __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 - __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ... - __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... - __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ... - - __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... - __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... - __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... - __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... - - u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ... - u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ... - u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ... - u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ... - - a.val = _mm_unpacklo_epi16(u0, u1); - b.val = _mm_unpackhi_epi16(u0, u1); - c.val = _mm_unpacklo_epi16(u2, u3); - d.val = _mm_unpackhi_epi16(u2, u3); -} - -#if 0 -inline void v_load_deinterleave(const unsigned*, v_uint32x4&, v_uint32x4&, v_uint32x4&) -{ - // !!! TODO !!!
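// Usage sketch (editorial, not part of the patch hunks): how the 4-channel uchar
// deinterleave above is meant to be used. Assumes a CV_SSE2 build with this header
// included and the cv namespace in scope; the buffer name "rgba" and the
// alpha-maximum computation are hypothetical illustration code.
inline uchar max_alpha_of_16_pixels(const uchar* rgba) // 16 RGBA pixels = 64 bytes
{
    v_uint8x16 r = v_setzero_u8(), g = r, b = r, a = r; // overwritten below
    v_load_deinterleave(rgba, r, g, b, a);  // split interleaved RGBA into four planes
    uchar buf[16];
    v_store(buf, a);                        // no 8-bit reduction exists yet, so finish on scalars
    uchar m = buf[0];
    for( int i = 1; i < 16; i++ )
        if( buf[i] > m ) m = buf[i];
    return m;
}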
-} -#endif - -inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) -{ - v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0 - v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1 - v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2 - v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3 - - v_transpose4x4(u0, u1, u2, u3, a, b, c, d); -} - -inline void v_load_deinterleave(const float*, v_float32x4&, v_float32x4&, v_float32x4&) -{ - // !!! TODO !!! -} - -inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d) -{ - v_float32x4 u0(_mm_loadu_ps(ptr)); - v_float32x4 u1(_mm_loadu_ps(ptr + 4)); - v_float32x4 u2(_mm_loadu_ps(ptr + 8)); - v_float32x4 u3(_mm_loadu_ps(ptr + 12)); - - v_transpose4x4(u0, u1, u2, u3, a, b, c, d); -} - -inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, - const v_uint8x16& c, const v_uint8x16& d) -{ - // a0 a1 a2 a3 .... - // b0 b1 b2 b3 .... - // c0 c1 c2 c3 .... - // d0 d1 d2 d3 .... - __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ... - __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ... - __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ... - __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ... - - __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ... - __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ... - __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ... - __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ... - - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v2); - _mm_storeu_si128((__m128i*)(ptr + 32), v1); - _mm_storeu_si128((__m128i*)(ptr + 48), v3); -} - -inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, - const v_uint16x8& c, const v_uint16x8& d) -{ - // a0 a1 a2 a3 .... - // b0 b1 b2 b3 .... - // c0 c1 c2 c3 .... - // d0 d1 d2 d3 .... - __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ... - __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ... - __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ... - __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ... - - __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ... - __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... - __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... - __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ... 
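// Usage sketch (editorial): v_transpose4x4, as used by the float deinterleave above,
// turns four row registers into four column registers (AoS -> SoA). Assumes CV_SSE2
// and the cv namespace; the 4x4 matrix values below are hypothetical.
float m[4][4] = { {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15} };
v_float32x4 r0 = v_load(m[0]), r1 = v_load(m[1]), r2 = v_load(m[2]), r3 = v_load(m[3]);
v_float32x4 c0 = r0, c1 = r1, c2 = r2, c3 = r3;   // initial values are irrelevant; overwritten next
v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // c0 = {0, 4, 8, 12}, i.e. the first column
v_store(m[0], c0); v_store(m[1], c1); v_store(m[2], c2); v_store(m[3], c3); // m is now transposed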
- - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 8), v2); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); - _mm_storeu_si128((__m128i*)(ptr + 24), v3); -} - -inline v_float32x4 v_cvt_f32(const v_int32x4& a) -{ - return v_float32x4(_mm_cvtepi32_ps(a.val)); -} - -inline v_float32x4 v_cvt_f32(const v_float64x2& a) -{ - return v_float32x4(_mm_cvtpd_ps(a.val)); -} - -inline v_float64x2 v_cvt_f64(const v_int32x4& a) -{ - return v_float64x2(_mm_cvtepi32_pd(a.val)); -} - -inline v_float64x2 v_cvt_f64(const v_float32x4& a) -{ - return v_float64x2(_mm_cvtps_pd(a.val)); -} - -#elif CV_NEON - -#define CV_SIMD128 1 - -struct v_uint8x16 -{ - explicit v_uint8x16(uint8x16_t v) : val(v) {} - v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, - uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) - { - uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = vld1q_u8(v); - } - uchar get0() const - { - return vgetq_lane_u8(val, 0); - } - - uint8x16_t val; -}; - -struct v_int8x16 -{ - explicit v_int8x16(int8x16_t v) : val(v) {} - v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, - schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) - { - schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = vld1q_s8(v); - } - schar get0() const - { - return vgetq_lane_s8(val, 0); - } - - int8x16_t val; -}; - -struct v_uint16x8 -{ - explicit v_uint16x8(uint16x8_t v) : val(v) {} - v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) - { - ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = vld1q_u16(v); - } - ushort get0() const - { - return vgetq_lane_u16(val, 0); - } - - uint16x8_t val; -}; - -struct v_int16x8 -{ - explicit v_int16x8(int16x8_t v) : val(v) {} - v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) - { - short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = vld1q_s16(v); - } - short get0() const - { - return vgetq_lane_s16(val, 0); - } - - int16x8_t val; -}; - -struct v_uint32x4 -{ - explicit v_uint32x4(uint32x4_t v) : val(v) {} - v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) - { - unsigned v[] = {v0, v1, v2, v3}; - val = vld1q_u32(v); - } - unsigned get0() const - { - return vgetq_lane_u32(val, 0); - } - - uint32x4_t val; -}; - -struct v_int32x4 -{ - explicit v_int32x4(int32x4_t v) : val(v) {} - v_int32x4(int v0, int v1, int v2, int v3) - { - int v[] = {v0, v1, v2, v3}; - val = vld1q_s32(v); - } - int get0() const - { - return vgetq_lane_s32(val, 0); - } - int32x4_t val; -}; - -struct v_float32x4 -{ - explicit v_float32x4(float32x4_t v) : val(v) {} - v_float32x4(float v0, float v1, float v2, float v3) - { - float v[] = {v0, v1, v2, v3}; - val = vld1q_f32(v); - } - float get0() const - { - return vgetq_lane_f32(val, 0); - } - float32x4_t val; -}; - -typedef v_reg v_float64x2; -typedef v_reg v_float64x4; - -#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ -inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ -inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ -inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \ -inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \ -inline v_int8x16 v_reinterpret_as_s8(const 
v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \ -inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \ -inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \ -inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \ -inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \ -inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); } - -OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16) -OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32) -OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32) - -inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) -{ - uint8x8_t a1 = vqmovn_u16(a.val), b1 = vqmovn_u16(b.val); - return v_uint8x16(vcombine_u8(a1, b1)); -} -inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) -{ - uint8x8_t a1 = vqmovun_s16(a.val), b1 = vqmovun_s16(b.val); - return v_uint8x16(vcombine_u8(a1, b1)); -} -inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) -{ - int8x8_t a1 = vqmovn_s16(a.val), b1 = vqmovn_s16(b.val); - return v_int8x16(vcombine_s8(a1, b1)); -} -inline void v_storen_u16(uchar* ptr, const v_uint16x8& a) { vst1_u8(ptr, vqmovn_u16(a.val)); } -inline void v_storeun_s16(uchar* ptr, const v_int16x8& a) { vst1_u8(ptr, vqmovun_s16(a.val)); } -inline void v_storen_s16(schar* ptr, const v_int16x8& a) { vst1_s8(ptr, vqmovn_s16(a.val)); } - -inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) -{ - uint16x4_t a1 = vqmovn_u32(a.val), b1 = vqmovn_u32(b.val); - return v_uint16x8(vcombine_u16(a1, b1)); -} -inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) -{ - uint16x4_t a1 = vqmovun_s32(a.val), b1 = vqmovun_s32(b.val); - return v_uint16x8(vcombine_u16(a1, b1)); -} -inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) -{ - int16x4_t a1 = vqmovn_s32(a.val), b1 = vqmovn_s32(b.val); - return v_int16x8(vcombine_s16(a1, b1)); -} -inline void v_storen_u32(ushort* ptr, const v_uint32x4& a) { vst1_u16(ptr, vqmovn_u32(a.val)); } -inline void v_storeun_s32(ushort* ptr, const v_int32x4& a) { vst1_u16(ptr, vqmovun_s32(a.val)); } -inline void v_storen_s32(short* ptr, const v_int32x4& a) { vst1_s16(ptr, vqmovn_s32(a.val)); } - -#define v_shiftn_u16(a, b, n) v_uint8x16(vcombine_u8(vqrshrn_n_u16((a).val, (n)), vqrshrn_n_u16((b).val, (n)))) -#define v_shiftn_s16(a, b, n) v_int8x16(vcombine_s8(vqrshrn_n_s16((a).val, (n)), vqrshrn_n_s16((b).val, (n)))) -#define v_shiftn_u32(a, b, n) v_uint16x8(vcombine_u16(vqrshrn_n_u32((a).val, (n)), vqrshrn_n_u32((b).val, (n)))) -#define v_shiftn_s32(a, b, n) v_int16x8(vcombine_s16(vqrshrn_n_s32((a).val, (n)), vqrshrn_n_s32((b).val, (n)))) -#define v_shiftun_s16(a, b, n) v_uint8x16(vcombine_u8(vqrshrun_n_s16((a).val, (n)), vqrshrun_n_s16((b).val, (n)))) -#define v_shiftun_s32(a, b, n) v_uint16x8(vcombine_u16(vqrshrun_n_s32((a).val, (n)), vqrshrun_n_s32((b).val, (n)))) -#define v_shiftstoren_u16(a, n) vst1_u8(vqrshrn_n_u16((a).val, (n))) -#define v_shiftstoren_s16(a, n) vst1_s8(vqrshrn_n_s16((a).val, (n))) -#define v_shiftstoreun_s16(a, n) 
vst1_u8(vqrshrun_n_s16((a).val, (n))) -#define v_shiftstoren_u32(a, n) vst1_u16(vqrshrn_n_u32((a).val, (n))) -#define v_shiftstoren_s32(a, n) vst1_s16(vqrshrn_n_s32((a).val, (n))) -#define v_shiftstoreun_s32(a, n) vst1_u16(vqrshrun_n_s32((a).val, (n))) - -inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) -{ - float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val); - float32x4_t res = vmulq_lane_f32(m0.val, vl, 0); - res = vmlaq_lane_f32(res, m1.val, vl, 1); - res = vmlaq_lane_f32(res, m2.val, vh, 0); - res = vmlaq_lane_f32(res, m3.val, vh, 1); - return v_float32x4(res); -} - -#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ -} - -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) - -inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) -{ - float32x4_t reciprocal = vrecpeq_f32(b.val); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - return v_float32x4(vmulq_f32(a.val, reciprocal)); -} -inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) -{ - float32x4_t reciprocal = vrecpeq_f32(b.val); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - a.val = vmulq_f32(a.val, reciprocal); - return a; -} - -#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ - { \ - return _Tpvec(vmvnq_##suffix(a.val)); \ - } - -OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8) -OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8) -OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16) -OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16) -OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32) -OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32) - -#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ -{ \ - return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = 
vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \ - return a; \ -} - -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32) - -inline v_float32x4 operator ~ (const v_float32x4& a) -{ - return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val)))); -} - -inline v_float32x4 v_sqrt(const v_float32x4& x) -{ - float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN)); - float32x4_t e = vrsqrteq_f32(x1); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); - return v_float32x4(vmulq_f32(x.val, e)); -} - -inline v_float32x4 v_invsqrt(const v_float32x4& x) -{ - float32x4_t e = vrsqrteq_f32(x.val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); - return v_float32x4(e); -} - -inline v_float32x4 v_abs(v_float32x4 x) -{ return v_float32x4(vabsq_f32(x.val)); } - -// TODO: exp, log, sin, cos - -#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \ -inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(intrin(a.val, b.val)); \ -} - -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32) - - -#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } - -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) - -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, 
vaddq_s8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16) - -// TODO: absdiff for signed integers -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) - -inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) -{ - v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); - return v_sqrt(x); -} - -inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) -{ - return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); -} - -inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) -{ - return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); -} - -// trade efficiency for convenience -#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ -{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ -{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } - -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32) - -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mullo, vmulq_u16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mullo, vmulq_s16) -OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mulhi2, vqrdmulhq_s16) - -#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec(vld1q_##suffix(ptr)); } \ -inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(vld1q_##suffix(ptr)); } \ -inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ -{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \ -inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ vst1q_##suffix(ptr, a.val); } \ -inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ vst1q_##suffix(ptr, a.val); } \ -inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \ -inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ -{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); } - -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) - -#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ -inline scalartype v_reduce_##func(const _Tpvec& a) \ -{ \ - scalartype CV_DECL_ALIGNED(16) buf[4]; \ - v_store_aligned(buf, a); \ - scalartype s0 = scalar_func(buf[0], 
buf[1]); \ - scalartype s1 = scalar_func(buf[2], buf[3]); \ - return scalar_func(s0, s1); \ -} - -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min) - -inline int v_signmask(const v_uint8x16& a) -{ - int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); - uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0)); - uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); - return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); -} -inline int v_signmask(const v_int8x16& a) -{ return v_signmask(v_reinterpret_as_u8(a)); } - -inline int v_signmask(const v_uint16x8& a) -{ - int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000)); - uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0)); - uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); - return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); -} -inline int v_signmask(const v_int16x8& a) -{ return v_signmask(v_reinterpret_as_u16(a)); } - -inline int v_signmask(const v_uint32x4& a) -{ - int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000)); - uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0)); - uint64x2_t v1 = vpaddlq_u32(v0); - return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2); -} -inline int v_signmask(const v_int32x4& a) -{ return v_signmask(v_reinterpret_as_u32(a)); } -inline int v_signmask(const v_float32x4& a) -{ return v_signmask(v_reinterpret_as_u32(a)); } - -#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \ -inline bool v_check_all(const v_##_Tpvec& a) \ -{ \ - _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \ - uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \ - return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \ -} \ -inline bool v_check_any(const v_##_Tpvec& a) \ -{ \ - _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \ - uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \ - return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \ -} - -OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7) -OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15) -OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31) - -inline bool v_check_all(const v_int8x16& a) -{ return v_check_all(v_reinterpret_as_u8(a)); } -inline bool v_check_all(const v_int16x8& a) -{ return v_check_all(v_reinterpret_as_u16(a)); } -inline bool v_check_all(const v_int32x4& a) -{ return v_check_all(v_reinterpret_as_u32(a)); } -inline bool v_check_all(const v_float32x4& a) -{ return v_check_all(v_reinterpret_as_u32(a)); } - -inline bool v_check_any(const v_int8x16& a) -{ return v_check_any(v_reinterpret_as_u8(a)); } -inline bool v_check_any(const v_int16x8& a) -{ return v_check_any(v_reinterpret_as_u16(a)); } -inline bool v_check_any(const v_int32x4& a) -{ return v_check_any(v_reinterpret_as_u32(a)); } -inline bool v_check_any(const v_float32x4& a) -{ return v_check_any(v_reinterpret_as_u32(a)); } - -#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) 
\ -inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \ -} - -OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8) -OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8) -OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16) -OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16) -OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32) -OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32) -OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32) - -#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \ -inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ -{ \ - b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \ - b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \ -} \ -inline _Tpwvec v_load_expand(const _Tp* ptr) \ -{ \ - return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \ -} - -OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8) -OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8) -OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16) -OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16) - -inline v_uint32x4 v_load_expand_q(const uchar* ptr) -{ - uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr); - uint16x4_t v1 = vget_low_u16(vmovl_u8(v0)); - return v_uint32x4(vmovl_u16(v1)); -} - -inline v_int32x4 v_load_expand_q(const schar* ptr) -{ - int8x8_t v0 = vcreate_s8(*(unsigned*)ptr); - int16x4_t v1 = vget_low_s16(vmovl_s8(v0)); - return v_int32x4(vmovl_s16(v1)); -} - -#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \ -inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ -{ \ - _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \ - b0.val = p.val[0]; \ - b1.val = p.val[1]; \ -} \ -inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ -{ \ - return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \ -} \ -inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ -{ \ - return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \ -} \ -inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ -{ \ - c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \ - d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \ -} - -OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8) -OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8) -OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16) -OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16) -OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32) -OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32) -OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32) - -inline v_int32x4 v_round(const v_float32x4& a) -{ - static const int32x4_t v_sign = vdupq_n_s32(1 << 31), - v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); - - int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); - return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); -} - -inline v_int32x4 v_floor(const v_float32x4& a) -{ - int32x4_t a1 = vcvtq_s32_f32(a.val); - uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val); - return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask))); -} - -inline v_int32x4 v_ceil(const v_float32x4& a) -{ - int32x4_t a1 = vcvtq_s32_f32(a.val); - uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1)); - return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask))); -} - -inline v_int32x4 
v_trunc(const v_float32x4& a) -{ return v_int32x4(vcvtq_s32_f32(a.val)); } - -#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \ -inline void transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \ - const v_##_Tpvec& a2, const v_##_Tpvec& a3, \ - v_##_Tpvec& b0, v_##_Tpvec& b1, \ - v_##_Tpvec& b2, v_##_Tpvec& b3) \ -{ \ - /* m00 m01 m02 m03 */ \ - /* m10 m11 m12 m13 */ \ - /* m20 m21 m22 m23 */ \ - /* m30 m31 m32 m33 */ \ - _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \ - _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \ - /* m00 m10 m02 m12 */ \ - /* m01 m11 m03 m13 */ \ - /* m20 m30 m22 m32 */ \ - /* m21 m31 m23 m33 */ \ - b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \ - b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \ - b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \ - b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \ -} - -OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32) -OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32) -OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32) - -#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ -inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ -{ \ - _Tpvec##x3_t v = vld3q_##suffix(ptr); \ - a.val = v.val[0]; \ - b.val = v.val[1]; \ - c.val = v.val[2]; \ -} \ -inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ - v_##_Tpvec& c, v_##_Tpvec& d) \ -{ \ - _Tpvec##x4_t v = vld4q_##suffix(ptr); \ - a.val = v.val[0]; \ - b.val = v.val[1]; \ - c.val = v.val[2]; \ - d.val = v.val[3]; \ -} \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ -{ \ - _Tpvec##x3_t v; \ - v.val[0] = a.val; \ - v.val[1] = b.val; \ - v.val[2] = c.val; \ - vst3q_##suffix(ptr, v); \ -} \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ - const v_##_Tpvec& c, const v_##_Tpvec& d) \ -{ \ - _Tpvec##x4_t v; \ - v.val[0] = a.val; \ - v.val[1] = b.val; \ - v.val[2] = c.val; \ - v.val[3] = d.val; \ - vst4q_##suffix(ptr, v); \ -} - -OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) - -inline v_float32x4 v_cvt_f32(const v_int32x4& a) -{ - return v_float32x4(vcvtq_f32_s32(a.val)); -} - -#else - -typedef v_reg v_uint8x16; -typedef v_reg v_int8x16; -typedef v_reg v_uint16x8; -typedef v_reg v_int16x8; -typedef v_reg v_uint32x4; -typedef v_reg v_int32x4; -typedef v_reg v_float32x4; -typedef v_reg v_float32x8; -typedef v_reg v_float64x2; -typedef v_reg v_float64x4; - -inline v_uint8x16 v_setzero_u8() { return v_uint8x16::zero(); } -inline v_int8x16 v_setzero_s8() { return v_int8x16::zero(); } -inline v_uint16x8 v_setzero_u16() { return v_uint16x8::zero(); } -inline v_int16x8 v_setzero_s16() { return v_int16x8::zero(); } -inline v_uint32x4 v_setzero_u32() { return v_uint32x4::zero(); } -inline v_int32x4 v_setzero_s32() { return v_int32x4::zero(); } -inline v_float32x4 v_setzero_f32() { return v_float32x4::zero(); } -inline v_float64x2 v_setzero_f64() { return v_float64x2::zero(); } - -inline v_uint8x16 
v_setall_u8(uchar v) { return v_uint8x16::all(v); } -inline v_int8x16 v_setall_s8(schar v) { return v_int8x16::all(v); } -inline v_uint16x8 v_setall_u16(ushort v) { return v_uint16x8::all(v); } -inline v_int16x8 v_setall_s16(short v) { return v_int16x8::all(v); } -inline v_uint32x4 v_setall_u32(unsigned v) { return v_uint32x4::all(v); } -inline v_int32x4 v_setall_s32(int v) { return v_int32x4::all(v); } -inline v_float32x4 v_setall_f32(float v) { return v_float32x4::all(v); } -inline v_float64x2 v_setall_f64(double v) { return v_float64x2::all(v); } - -template inline v_uint8x16 v_reinterpret_as_u8(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -template inline v_int8x16 v_reinterpret_as_s8(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -template inline v_uint16x8 v_reinterpret_as_u16(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -template inline v_int16x8 v_reinterpret_as_s16(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -template inline v_uint32x4 v_reinterpret_as_u32(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -template inline v_int32x4 v_reinterpret_as_s32(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -template inline v_float32x4 v_reinterpret_as_f32(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -template inline v_float64x2 v_reinterpret_as_f64(const v_reg<_Tp, n>& a) -{ return v_reg<_Tp, n>::template reinterpret_as(a); } - -inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) -{ return v_cvtsat(a, b); } -inline v_uint8x16 v_shiftn_u16(const v_uint16x8& a, const v_uint16x8& b, int n) -{ return v_cvtsat(a, b, n); } -inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) -{ return v_cvtsat(a, b); } -inline v_uint8x16 v_shiftun_s16(const v_int16x8& a, const v_int16x8& b, int n) -{ return v_cvtsat(a, b, n); } - -inline void v_storen_u16(uchar* ptr, const v_uint16x8& b) -{ return v_storesat(ptr, b); } -inline void v_shiftstoren_u16(uchar* ptr, const v_uint16x8& b, int n) -{ return v_storesat(ptr, b, n); } -inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& b) -{ return v_storesat(ptr, b); } -inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& b, int n) -{ return v_storesat(ptr, b, n); } - -inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) -{ return v_cvtsat(a, b); } -inline v_int8x16 v_shiftn_s16(const v_int16x8& a, const v_int16x8& b, int n) -{ return v_cvtsat(a, b, n); } - -inline void v_storen_s16(schar* ptr, const v_int16x8& b) -{ return v_storesat(ptr, b); } -inline void v_shiftstoren_s16(schar* ptr, const v_int16x8& b, int n) -{ return v_storesat(ptr, b, n); } - -inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) -{ return v_cvtsat(a, b); } -inline v_uint16x8 v_shiftn_u32(const v_uint32x4& a, const v_uint32x4& b, int n) -{ return v_cvtsat(a, b, n); } -inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) -{ return v_cvtsat(a, b); } -inline v_uint16x8 v_shiftun_s32(const v_int32x4& a, const v_int32x4& b, int n) -{ return v_cvtsat(a, b, n); } - -inline void v_storen_u32(ushort* ptr, const v_uint32x4& b) -{ return v_storesat(ptr, b); } -inline void v_shiftstoren_u32(ushort* ptr, const v_uint32x4& b, int n) -{ return v_storesat(ptr, b, n); } -inline void v_storeun_s32(ushort* ptr, const v_int32x4& b) -{ return v_storesat(ptr, b); } -inline void 
v_shiftstoreun_s32(ushort* ptr, const v_int32x4& b, int n) -{ return v_storesat(ptr, b, n); } - -inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) -{ return v_cvtsat(a, b); } -inline v_int16x8 v_shiftn_s32(const v_int32x4& a, const v_int32x4& b, int n) -{ return v_cvtsat(a, b, n); } - -inline void v_storen_s32(short* ptr, const v_int32x4& b) -{ return v_storesat(ptr, b); } -inline void v_shiftstoren_s32(short* ptr, const v_int32x4& b, int n) -{ return v_storesat(ptr, b, n); } - -inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) -{ - return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], - v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], - v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], - v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); -} +#include "opencv2/hal/intrin_cpp.hpp" #endif -}} - #ifndef CV_SIMD128 #define CV_SIMD128 0 #endif diff --git a/modules/hal/include/opencv2/hal/intrin_cpp.hpp b/modules/hal/include/opencv2/hal/intrin_cpp.hpp new file mode 100644 index 0000000000..d0d5b28a29 --- /dev/null +++ b/modules/hal/include/opencv2/hal/intrin_cpp.hpp @@ -0,0 +1,811 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
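// Usage sketch (editorial): the semantics of v_matmul as defined in this patch.
// m0..m3 act as the columns of a 4x4 matrix, so the result equals
// v.s[0]*m0 + v.s[1]*m1 + v.s[2]*m2 + v.s[3]*m3. The identity-matrix columns
// below are hypothetical illustration data; assumes the cv namespace is in scope.
v_float32x4 x(1.f, 2.f, 3.f, 4.f);
v_float32x4 c0(1.f, 0.f, 0.f, 0.f), c1(0.f, 1.f, 0.f, 0.f),
            c2(0.f, 0.f, 1.f, 0.f), c3(0.f, 0.f, 0.f, 1.f);
v_float32x4 y = v_matmul(x, c0, c1, c2, c3); // identity columns, so y carries the same lanes as x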
+// +//M*/ + +#ifndef __OPENCV_HAL_INTRIN_CPP_HPP__ +#define __OPENCV_HAL_INTRIN_CPP_HPP__ + +namespace cv +{ + +template struct v_reg +{ + typedef _Tp lane_type; + typedef v_reg::int_type, n> int_vec; + typedef v_reg::abs_type, n> abs_vec; + enum { nlanes = n }; + + explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } + v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } + v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } + v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, + _Tp s4, _Tp s5, _Tp s6, _Tp s7) + { + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; + } + v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, + _Tp s4, _Tp s5, _Tp s6, _Tp s7, + _Tp s8, _Tp s9, _Tp s10, _Tp s11, + _Tp s12, _Tp s13, _Tp s14, _Tp s15) + { + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; + s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; + s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; + } + + v_reg() {} + v_reg(const v_reg<_Tp, n> & r) + { + for( int i = 0; i < n; i++ ) + s[i] = r.s[i]; + } + + _Tp get(const int i) const { return s[i]; } + _Tp get0() const { return s[0]; } + v_reg<_Tp, n> high() const + { + v_reg<_Tp, n> c; + int i; + for( i = 0; i < n/2; i++ ) + { + c.s[i] = s[i+(n/2)]; + c.s[i+(n/2)] = 0; + } + return c; + } + + static v_reg<_Tp, n> zero() + { + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = (_Tp)0; + return c; + } + + static v_reg<_Tp, n> all(_Tp s) + { + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = s; + return c; + } + + template v_reg<_Tp2, n2> reinterpret_as() const + { + size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); + v_reg<_Tp2, n2> c; + memcpy(&c.s[0], &s[0], bytes); + return c; + } + + _Tp s[n]; +}; + +#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \ +template inline v_reg<_Tp, n> \ + operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ + return c; \ +} \ +template inline v_reg<_Tp, n>& \ + operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + for( int i = 0; i < n; i++ ) \ + a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ + return a; \ +} + +OPENCV_HAL_IMPL_BIN_OP(+) +OPENCV_HAL_IMPL_BIN_OP(-) +OPENCV_HAL_IMPL_BIN_OP(*) +OPENCV_HAL_IMPL_BIN_OP(/) + +#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ +template inline v_reg<_Tp, n> operator bit_op \ + (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + typedef typename V_TypeTraits<_Tp>::int_type itype; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ + V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ + return c; \ +} \ +template inline v_reg<_Tp, n>& operator \ + bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef typename V_TypeTraits<_Tp>::int_type itype; \ + for( int i = 0; i < n; i++ ) \ + a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ + V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ + return a; \ +} + +OPENCV_HAL_IMPL_BIT_OP(&) +OPENCV_HAL_IMPL_BIT_OP(|) +OPENCV_HAL_IMPL_BIT_OP(^) + +template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); + return c; +} + +#define 
OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ +template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ +{ \ + v_reg<_Tp2, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cfunc(a.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, + typename V_TypeTraits<_Tp>::abs_type) +OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) +OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) +OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) +OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) + +#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \ +template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cfunc(a.s[i], b.s[i]); \ + return c; \ +} \ +template inline _Tp hfunc(const v_reg<_Tp, n>& a) \ +{ \ + _Tp c = a.s[0]; \ + for( int i = 1; i < n; i++ ) \ + c = cfunc(c, a.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min) +OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max) + +template +inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) +{ + for( int i = 0; i < n; i++ ) + { + minval.s[i] = std::min(a.s[i], b.s[i]); + maxval.s[i] = std::max(a.s[i], b.s[i]); + } +} + + +#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ +template \ +inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef typename V_TypeTraits<_Tp>::int_type itype; \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ + return c; \ +} + +OPENCV_HAL_IMPL_CMP_OP(<) +OPENCV_HAL_IMPL_CMP_OP(>) +OPENCV_HAL_IMPL_CMP_OP(<=) +OPENCV_HAL_IMPL_CMP_OP(>=) +OPENCV_HAL_IMPL_CMP_OP(==) +OPENCV_HAL_IMPL_CMP_OP(!=) + +#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \ +template \ +inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef _Tp2 rtype; \ + v_reg c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) +OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) +OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type) + +template +inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = 1.f/std::sqrt(a.s[i]); + return c; +} + +template +inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); + return c; +} + + +template +inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; + return c; +} + +template +inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) +{ + v_reg<_Tp, n> d; + for( int i = 0; i < n; i++ ) + d.s[i] = a.s[i]*b.s[i] + c.s[i]; + return d; +} + +template inline v_reg::w_type, n/2> + v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typedef typename 
V_TypeTraits<_Tp>::w_type w_type; + v_reg c; + for( int i = 0; i < (n/2); i++ ) + c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1]; + return c; +} + +template inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg::w_type, n/2>& c, + v_reg::w_type, n/2>& d) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = (w_type)a.s[i]*b.s[i]*2; + d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)]; + } +} + +template inline void v_hsum(const v_reg<_Tp, n>& a, + v_reg::w_type, n/2>& c) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1]; + } +} + +#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ +template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = (_Tp)(a.s[i] shift_op imm); \ + return c; \ +} + +OPENCV_HAL_IMPL_SHIFT_OP(<<) +OPENCV_HAL_IMPL_SHIFT_OP(>>) + +template inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) +{ + typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; + for( int i = 1; i < n; i++ ) + c += a.s[i]; + return c; +} + +template inline int v_signmask(const v_reg<_Tp, n>& a) +{ + int mask = 0; + for( int i = 0; i < n; i++ ) + mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; + return mask; +} + +template inline bool v_check_all(const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) + return false; + return true; +} + +template inline bool v_check_any(const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) + return true; + return false; +} + +template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, + const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? 
b.s[i] : a.s[i]; + return c; +} + +template inline void v_expand(const v_reg<_Tp, n>& a, + v_reg::w_type, n/2>& b0, + v_reg::w_type, n/2>& b1) +{ + for( int i = 0; i < (n/2); i++ ) + { + b0.s[i] = a.s[i]; + b1.s[i] = a.s[i+(n/2)]; + } +} + +template inline v_reg::int_type, n> + v_reinterpret_as_int(const v_reg<_Tp, n>& a) +{ + v_reg::int_type, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]); + return c; +} + +template inline v_reg::uint_type, n> + v_reinterpret_as_uint(const v_reg<_Tp, n>& a) +{ + v_reg::uint_type, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]); + return c; +} + +template inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, + v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) +{ + int i; + for( i = 0; i < n/2; i++ ) + { + b0.s[i*2] = a0.s[i]; + b0.s[i*2+1] = a1.s[i]; + } + for( ; i < n; i++ ) + { + b1.s[i*2-n] = a0.s[i]; + b1.s[i*2-n+1] = a1.s[i]; + } +} + +template inline v_reg<_Tp, n> v_load(const _Tp* ptr) +{ + return v_reg<_Tp, n>(ptr); +} + +template inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr) +{ + return v_reg<_Tp, n>(ptr); +} + +template inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n/2; i++ ) + { + c.s[i] = loptr[i]; + c.s[i+n/2] = hiptr[i]; + } + return c; +} + +template inline v_reg::w_type, n> v_load_expand(const _Tp* ptr) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + +template inline v_reg::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr) +{ + typedef typename V_TypeTraits::w_type>::w_type w_type; + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + +template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) +{ + int i, i3; + for( i = i3 = 0; i < n; i++, i3 += 3 ) + { + a.s[i] = ptr[i3]; + b.s[i] = ptr[i3+1]; + c.s[i] = ptr[i3+2]; + } +} + +template +inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, + v_reg<_Tp, n>& d) +{ + int i, i4; + for( i = i4 = 0; i < n; i++, i4 += 4 ) + { + a.s[i] = ptr[i4]; + b.s[i] = ptr[i4+1]; + c.s[i] = ptr[i4+2]; + d.s[i] = ptr[i4+3]; + } +} + +template +inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) +{ + int i, i3; + for( i = i3 = 0; i < n; i++, i3 += 3 ) + { + ptr[i3] = a.s[i]; + ptr[i3+1] = b.s[i]; + ptr[i3+2] = c.s[i]; + } +} + +template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, + const v_reg<_Tp, n>& d) +{ + int i, i4; + for( i = i4 = 0; i < n; i++, i4 += 4 ) + { + ptr[i4] = a.s[i]; + ptr[i4+1] = b.s[i]; + ptr[i4+2] = c.s[i]; + ptr[i4+3] = d.s[i]; + } +} + +template +inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + +template +inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < (n/2); i++ ) + ptr[i] = a.s[i]; +} + +template +inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < (n/2); i++ ) + ptr[i] = a.s[i+(n/2)]; +} + +template +inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + +template +inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i 
= 0; i < (n/2); i++ ) + { + c.s[i] = a.s[i]; + c.s[i+(n/2)] = b.s[i]; + } +} + +template +inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = a.s[i+(n/2)]; + c.s[i+(n/2)] = b.s[i+(n/2)]; + } +} + +template +inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) +{ + for( int i = 0; i < (n/2); i++ ) + { + low.s[i] = a.s[i]; + low.s[i+(n/2)] = b.s[i]; + high.s[i] = a.s[i+(n/2)]; + high.s[i+(n/2)] = b.s[i+(n/2)]; + } +} + +template inline v_reg v_round(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvRound(a.s[i]); + return c; +} + +template inline v_reg v_floor(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvFloor(a.s[i]); + return c; +} + +template inline v_reg v_ceil(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvCeil(a.s[i]); + return c; +} + +template inline v_reg v_trunc(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (int)(a.s[i]); + return c; +} + +template inline v_reg v_round(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvRound(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_floor(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvFloor(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_ceil(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvCeil(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_trunc(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvCeil(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_cvt_f32(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (float)a.s[i]; + return c; +} + +template inline v_reg v_cvt_f64(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + +template inline v_reg v_cvt_f64(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + +template +inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, + const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, + v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, + v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) +{ + b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); + b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); + b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); + b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); +} + +typedef v_reg v_uint8x16; +typedef v_reg v_int8x16; +typedef v_reg v_uint16x8; +typedef v_reg v_int16x8; +typedef v_reg v_uint32x4; +typedef v_reg v_int32x4; +typedef v_reg v_float32x4; +typedef v_reg v_float32x8; +typedef v_reg v_float64x2; +typedef v_reg v_uint64x2; +typedef v_reg v_int64x2; + +#define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \ +inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \ +template inline _Tpvec \ + v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ +{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); } + +OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16) +OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_C_INIT(v_int32x4, 
int, s32) +OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32) +OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64) +OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64) + +#define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \ +template inline _Tpvec v_lshift(const _Tpvec& a) \ +{ return a << n; } \ +template inline _Tpvec v_rshift(const _Tpvec& a) \ +{ return a >> n; } \ +template inline _Tpvec v_rshift_round(const _Tpvec& a) \ +{ \ + _Tpvec c; \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + return c; \ +} + +OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort) +OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short) +OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int) +OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64) +OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64) + + +#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ +inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpnvec c; \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + { \ + c.s[i] = saturate_cast<_Tpn>(a.s[i]); \ + c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \ + } \ + return c; \ +} \ +template inline _Tpnvec v_rshift_round_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpnvec c; \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + { \ + c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + } \ + return c; \ +} \ +inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ +{ \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + ptr[i] = saturate_cast<_Tpn>(a.s[i]); \ +} \ +template inline void v_rshift_round_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ +{ \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ +} + +OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u) +OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u) +OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack) +OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack) + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], + v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], + v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], + v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); +} + +} + +#endif diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp new file mode 100644 index 0000000000..4bda95db68 --- /dev/null +++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp @@ -0,0 +1,823 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_INTRIN_NEON_HPP__ +#define __OPENCV_HAL_INTRIN_NEON_HPP__ + +namespace cv +{ + +#define CV_SIMD128 1 + +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16 }; + + v_uint8x16() {} + explicit v_uint8x16(uint8x16_t v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = vld1q_u8(v); + } + uchar get0() const + { + return vgetq_lane_u8(val, 0); + } + + uint8x16_t val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(int8x16_t v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = vld1q_s8(v); + } + schar get0() const + { + return vgetq_lane_s8(val, 0); + } + + int8x16_t val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(uint16x8_t v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = vld1q_u16(v); + } + ushort get0() const + { + return vgetq_lane_u16(val, 0); + } + + uint16x8_t val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(int16x8_t v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = vld1q_s16(v); + } + short get0() const + { + return vgetq_lane_s16(val, 0); + } + + int16x8_t val; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(uint32x4_t v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + unsigned v[] = {v0, v1, v2, v3}; + val = vld1q_u32(v); + } + unsigned get0() const + { + return vgetq_lane_u32(val, 0); + } + + uint32x4_t val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(int32x4_t v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + val = vld1q_s32(v); + } + int get0() const + { + return vgetq_lane_s32(val, 0); + } + int32x4_t val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(float32x4_t v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + val = vld1q_f32(v); + } + float get0() const + { + return vgetq_lane_f32(val, 0); + } + float32x4_t val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(uint64x2_t v) : val(v) {} + v_uint64x2(unsigned v0, unsigned v1) + { + uint64 v[] = {v0, v1}; + val = vld1q_u64(v); + } + uint64 get0() const + { + return vgetq_lane_u64(val, 0); + } + uint64x2_t val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(int64x2_t v) : val(v) {} + v_int64x2(int v0, int v1) + { + int64 v[] = {v0, v1}; + val = vld1q_s64(v); + } 
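+    // get0(): extract lane 0 (the lowest element) of the vector as a scalar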
+ int64 get0() const + { + return vgetq_lane_s64(val, 0); + } + int64x2_t val; +}; + +#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ +inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ +inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ +inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \ +inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \ +inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \ +inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \ +inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \ +inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \ +inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \ +inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \ +inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); } + +OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64) +OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64) +OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32) + +#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \ +inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \ + return _Tpvec(vcombine_##suffix(a1, b1)); \ +} \ +inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = vqmov##op##_##wsuffix(a.val); \ + vst1_##suffix(ptr, a1); \ +} \ +template inline \ +_Tpvec v_rshift_round_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \ + hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \ + return _Tpvec(vcombine_##suffix(a1, b1)); \ +} \ +template inline \ +void v_rshift_round_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \ + vst1_##suffix(ptr, a1); \ +} + +OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un) +OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un) +OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u32, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n) + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const 
v_float32x4& m2, + const v_float32x4& m3) +{ + float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val); + float32x4_t res = vmulq_lane_f32(m0.val, vl, 0); + res = vmlaq_lane_f32(res, m1.val, vl, 1); + res = vmlaq_lane_f32(res, m2.val, vh, 0); + res = vmlaq_lane_f32(res, m3.val, vh, 1); + return v_float32x4(res); +} + +#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64) + +inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) +{ + float32x4_t reciprocal = vrecpeq_f32(b.val); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + return v_float32x4(vmulq_f32(a.val, reciprocal)); +} +inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) +{ + float32x4_t reciprocal = vrecpeq_f32(b.val); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); + a.val = vmulq_f32(a.val, reciprocal); + return a; +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); + d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)); + d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val)); + d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val)); +} + +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ + int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); + int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); + int32x4x2_t cd = vtrnq_s32(c, d); + return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); +} + +#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) 
\ + OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { \ + return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \ + } + +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64) +OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64) + +#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val)))); +} + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ + float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN)); + float32x4_t e = vrsqrteq_f32(x1); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); + return v_float32x4(vmulq_f32(x.val, e)); +} + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + float32x4_t e = vrsqrteq_f32(x.val); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + return v_float32x4(e); +} + +inline v_float32x4 v_abs(v_float32x4 x) +{ return v_float32x4(vabsq_f32(x.val)); } + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32) + + +#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator > (const _Tpvec& a, const 
_Tpvec& b) \ +{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } + +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) + +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16) + +// TODO: absdiff for signed integers +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) + +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); +} + +// trade efficiency for convenience +#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \ +template inline _Tpvec lshift(const _Tpvec& a) \ +{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \ +template inline _Tpvec rshift(const _Tpvec& a) \ +{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \ +template inline _Tpvec rshift_round(const _Tpvec& a) \ +{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } + +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64) + +#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(vld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vld1q_##suffix(ptr)); } \ +inline _Tpvec 
v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); } + +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) + +#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + scalartype CV_DECL_ALIGNED(16) buf[4]; \ + v_store_aligned(buf, a); \ + scalartype s0 = scalar_func(buf[0], buf[1]); \ + scalartype s1 = scalar_func(buf[2], buf[3]); \ + return scalar_func(s0, s1); \ +} + +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min) + +inline int v_signmask(const v_uint8x16& a) +{ + int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); + uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); +} +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } + +inline int v_signmask(const v_uint16x8& a) +{ + int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000)); + uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); +} +inline int v_signmask(const v_int16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000)); + uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(v0); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \ +inline bool v_check_all(const v_##_Tpvec& a) \ +{ \ + _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \ + uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \ + return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \ +} \ +inline bool 
v_check_any(const v_##_Tpvec& a) \
+{ \
+    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
+    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
+}
+
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
+
+inline bool v_check_all(const v_int8x16& a)
+{ return v_check_all(v_reinterpret_as_u8(a)); }
+inline bool v_check_all(const v_int16x8& a)
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_all(const v_int32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_float32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_any(const v_int8x16& a)
+{ return v_check_any(v_reinterpret_as_u8(a)); }
+inline bool v_check_any(const v_int16x8& a)
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
+
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
+    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
+}
+
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
+    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
+    return v_uint32x4(vmovl_u16(v1));
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
+    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
+    return v_int32x4(vmovl_s16(v1));
+}
+
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
+    b0.val = p.val[0]; \
+    b1.val = p.val[1]; \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), 
vget_low_##suffix(b.val)); \ + d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \ +} + +OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8) +OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8) +OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16) +OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16) +OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32) +OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32) +OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32) + +inline v_int32x4 v_round(const v_float32x4& a) +{ + static const int32x4_t v_sign = vdupq_n_s32(1 << 31), + v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); + + int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); + return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); +} + +inline v_int32x4 v_floor(const v_float32x4& a) +{ + int32x4_t a1 = vcvtq_s32_f32(a.val); + uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val); + return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask))); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + int32x4_t a1 = vcvtq_s32_f32(a.val); + uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1)); + return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask))); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(vcvtq_s32_f32(a.val)); } + +#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \ +inline void transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \ + const v_##_Tpvec& a2, const v_##_Tpvec& a3, \ + v_##_Tpvec& b0, v_##_Tpvec& b1, \ + v_##_Tpvec& b2, v_##_Tpvec& b3) \ +{ \ + /* m00 m01 m02 m03 */ \ + /* m10 m11 m12 m13 */ \ + /* m20 m21 m22 m23 */ \ + /* m30 m31 m32 m33 */ \ + _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \ + _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \ + /* m00 m10 m02 m12 */ \ + /* m01 m11 m03 m13 */ \ + /* m20 m30 m22 m32 */ \ + /* m21 m31 m23 m33 */ \ + b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \ + b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \ + b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \ + b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \ +} + +OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32) +OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32) +OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32) + +#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + _Tpvec##x3_t v = vld3q_##suffix(ptr); \ + a.val = v.val[0]; \ + b.val = v.val[1]; \ + c.val = v.val[2]; \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + _Tpvec##x4_t v = vld4q_##suffix(ptr); \ + a.val = v.val[0]; \ + b.val = v.val[1]; \ + c.val = v.val[2]; \ + d.val = v.val[3]; \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ +{ \ + _Tpvec##x3_t v; \ + v.val[0] = a.val; \ + v.val[1] = b.val; \ + v.val[2] = c.val; \ + vst3q_##suffix(ptr, v); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, const v_##_Tpvec& d) \ +{ \ + _Tpvec##x4_t v; \ + v.val[0] = a.val; \ + v.val[1] = b.val; \ + v.val[2] = c.val; \ + v.val[3] = d.val; \ + vst4q_##suffix(ptr, v); \ +} + +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, 
s8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(vcvtq_f32_s32(a.val)); +} + +} + +#endif diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp new file mode 100644 index 0000000000..12f5789ea5 --- /dev/null +++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp @@ -0,0 +1,1544 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_SSE_HPP__ +#define __OPENCV_HAL_SSE_HPP__ + +#define CV_SIMD128 1 +#define CV_SIMD128_64F 1 + +namespace cv +{ + +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16 }; + + v_uint8x16() {} + explicit v_uint8x16(__m128i v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6, (char)v7, + (char)v8, (char)v9, (char)v10, (char)v11, + (char)v12, (char)v13, (char)v14, (char)v15); + } + uchar get0() const + { + return (uchar)_mm_cvtsi128_si32(val); + } + + __m128i val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(__m128i v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6, (char)v7, + (char)v8, (char)v9, (char)v10, (char)v11, + (char)v12, (char)v13, (char)v14, (char)v15); + } + schar get0() const + { + return (schar)_mm_cvtsi128_si32(val); + } + + __m128i val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(__m128i v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7); + } + ushort get0() const + { + return (ushort)_mm_cvtsi128_si32(val); + } + + __m128i val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(__m128i v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7); + } + short get0() const + { + return (short)_mm_cvtsi128_si32(val); + } + __m128i val; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(__m128i v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); + } + unsigned get0() const + { + return (unsigned)_mm_cvtsi128_si32(val); + } + __m128i val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(__m128i v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + val = _mm_setr_epi32(v0, v1, v2, v3); + } + int get0() const + { + return _mm_cvtsi128_si32(val); + } + __m128i val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(__m128 v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + val = _mm_setr_ps(v0, v1, v2, v3); + } + float get0() const + { + return _mm_cvtss_f32(val); + } + __m128 val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(__m128i v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); + } + uint64 get0() const + { + int a = 
_mm_cvtsi128_si32(val); + int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); + return (unsigned)a | ((uint64)(unsigned)b << 32); + } + __m128i val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(__m128i v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); + } + int64 get0() const + { + int a = _mm_cvtsi128_si32(val); + int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); + return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + } + __m128i val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(__m128d v) : val(v) {} + v_float64x2(double v0, double v1) + { + val = _mm_setr_pd(v0, v1); + } + double get0() const + { + return _mm_cvtsd_f64(val); + } + __m128d val; +}; + +#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ +inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ +inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ +template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ +{ return _Tpvec(cast(a.val)); } + +OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps) +OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd) + +inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); } +inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); } +inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); } +inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); } + +template inline +v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } +template inline +v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a) +{ return v_float32x4(_mm_castsi128_ps(a.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a) +{ return v_float32x4(_mm_castsi128_ps(a.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a) +{ return v_float64x2(_mm_castsi128_pd(a.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a) +{ return v_float64x2(_mm_castsi128_pd(a.val)); } + +#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \ +inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \ +{ return _Tpvec(_mm_castps_si128(a.val)); } \ +inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \ +{ return _Tpvec(_mm_castpd_si128(a.val)); } + +OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8) +OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8) +OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16) +OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16) +OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32) +OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32) 
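+// Note: the v_reinterpret_as_* casts only reinterpret the 128-bit pattern; no value
+// conversion is performed. The next two invocations extend the casts from the float
+// vectors to the 64-bit integer vectors.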
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64) +OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64) + +//////////////// PACK /////////////// +inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i delta = _mm_set1_epi16(255); + return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)), + _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta)))); +} + +inline void v_pack_store(uchar* ptr, const v_uint16x8& a) +{ + __m128i delta = _mm_set1_epi16(255); + __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)); + _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); +} + +inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) +{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } + +inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) +{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } + +template inline +v_uint8x16 v_rshift_round_pack(const v_uint16x8& a, const v_uint16x8& b) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n), + _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n))); +} + +template inline +void v_rshift_round_pack_store(uchar* ptr, const v_uint16x8& a) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); +} + +template inline +v_uint8x16 v_rshift_round_pack_u(const v_int16x8& a, const v_int16x8& b) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), + _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); +} + +template inline +void v_rshift_round_pack_u_store(uchar* ptr, const v_int16x8& a) +{ + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1)); +} + +inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) +{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); } + +inline void v_pack_store(schar* ptr, v_int16x8& a) +{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } + +template inline +v_int8x16 v_rshift_round_pack(const v_int16x8& a, const v_int16x8& b) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), + _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); +} +template inline +void v_rshift_round_pack_store(schar* ptr, const v_int16x8& a) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); + __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1)); +} + + +// bit-wise "mask ? 
a : b" +inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) +{ + return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); +} + +inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); + __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); + __m128i r = _mm_packs_epi32(a1, b1); + return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); +} + +inline void v_pack_store(ushort* ptr, const v_uint32x4& a) +{ + __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); + __m128i r = _mm_packs_epi32(a1, a1); + _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768))); +} + +template inline +v_uint16x8 v_rshift_round_pack(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); + return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); +} + +template inline +void v_rshift_round_pack_store(ushort* ptr, const v_uint32x4& a) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); + _mm_storel_epi64((__m128i*)ptr, a2); +} + +inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) +{ + __m128i delta32 = _mm_set1_epi32(32768); + __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); + return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); +} + +inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) +{ + __m128i delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(a.val, delta32); + __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); + _mm_storel_epi64((__m128i*)ptr, r); +} + +template inline +void v_rshift_round_pack_u_store(ushort* ptr, const v_int32x4& a) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); + _mm_storel_epi64((__m128i*)ptr, a2); +} + +inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) +{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); } + +inline void v_pack_store(short* ptr, const v_int32x4& a) +{ + _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); +} + +template inline +v_int16x8 v_rshift_round_pack(const v_int32x4& a, const v_int32x4& b) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)); + return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), + _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); +} + +template inline +void v_rshift_round_pack_store(short* ptr, const v_int32x4& a) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)); + __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, a1); +} + + +// [a0 0 | b0 0] [a1 0 | b1 0] 
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) +{ + __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 + __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 + return v_uint32x4(_mm_unpacklo_epi64(v0, v1)); +} + +inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) +{ + __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); + _mm_storel_epi64((__m128i*)ptr, a1); +} + +// [a0 0 | b0 0] [a1 0 | b1 0] +inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) +{ + __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 + __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 + return v_int32x4(_mm_unpacklo_epi64(v0, v1)); +} + +inline void v_pack_store(int* ptr, const v_int64x2& a) +{ + __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); + _mm_storel_epi64((__m128i*)ptr, a1); +} + +template inline +v_uint32x4 v_rshift_round_pack(const v_uint64x2& a, const v_uint64x2& b) +{ + uint64 delta = (uint64)1 << (n-1); + v_uint64x2 delta2(delta, delta); + __m128 a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); + __m128 b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); + __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 + __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 + return v_uint32x4(_mm_unpacklo_epi64(v0, v1)); +} + +template inline +void v_rshift_round_pack_store(unsigned* ptr, const v_uint64x2& a) +{ + uint64 delta = (uint64)1 << (n-1); + v_uint64x2 delta2(delta, delta); + __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); + __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); + _mm_storel_epi64((__m128i*)ptr, a2); +} + +inline __m128i v_sign_epi64(__m128i a) +{ + return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1 +} + +inline __m128i v_srai_epi64(__m128i a, int imm) +{ + __m128i smask = v_sign_epi64(a); + return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask); +} + +template inline +v_int32x4 v_rshift_round_pack(const v_int64x2& a, const v_int64x2& b) +{ + int64 delta = (int64)1 << (n-1); + v_int64x2 delta2(delta, delta); + __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); + __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n); + __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 + __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 + return v_int32x4(_mm_unpacklo_epi64(v0, v1)); +} + +template inline +void v_rshift_round_pack_store(int* ptr, const v_int64x2& a) +{ + int64 delta = (int64)1 << (n-1); + v_int64x2 delta2(delta, delta); + __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n); + __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0)); + _mm_storel_epi64((__m128i*)ptr, a2); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); + __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val); + __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val); + __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val); + + return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3))); +} + + +#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { \ + return _Tpvec(intrin(a.val, b.val)); \ + } \ + inline _Tpvec& 
operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { \ + a.val = intrin(a.val, b.val); \ + return a; \ + } + +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) + +inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i c0 = _mm_mul_epu32(a.val, b.val); + __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + __m128i d0 = _mm_unpacklo_epi32(c0, c1); + __m128i d1 = _mm_unpackhi_epi32(c0, c1); + return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); +} +inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) +{ + __m128i c0 = _mm_mul_epu32(a.val, b.val); + __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + __m128i d0 = _mm_unpacklo_epi32(c0, c1); + __m128i d1 = _mm_unpackhi_epi32(c0, c1); + return v_int32x4(_mm_unpacklo_epi64(d0, d1)); +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + __m128i v0 = _mm_mullo_epi16(a.val, b.val); + __m128i v1 = _mm_mulhi_epi16(a.val, b.val); + c.val = _mm_unpacklo_epi32(v0, v1); + d.val = _mm_unpackhi_epi32(v0, v1); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + __m128i v0 = _mm_mullo_epi16(a.val, b.val); + __m128i v1 = _mm_mulhi_epu16(a.val, b.val); + c.val = _mm_unpacklo_epi32(v0, v1); + d.val = _mm_unpackhi_epi32(v0, v1); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + __m128i c0 = _mm_mul_epu32(a.val, b.val); + __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + c.val = _mm_unpacklo_epi64(c0, c1); + d.val = _mm_unpackhi_epi64(c0, c1); +} + +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ + return v_int32x4(_mm_madd_epi16(a.val, b.val)); +} + +#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ + inline _Tpvec operator ~ (const 
_Tpvec& a) \ + { \ + return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ + } + +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1)) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) +OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ return v_float32x4(_mm_sqrt_ps(x.val)); } + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); + __m128 t = x.val; + __m128 h = _mm_mul_ps(t, _0_5); + t = _mm_rsqrt_ps(t); + t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); + return v_float32x4(t); +} + +inline v_float64x2 v_sqrt(const v_float64x2& x) +{ return v_float64x2(_mm_sqrt_pd(x.val)); } + +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ + static const __m128d v_1 = _mm_set1_pd(1.); + return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); +} + +inline v_float32x4 v_abs(const v_float32x4& x) +{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } +inline v_float64x2 v_abs(const v_float64x2& x) +{ + return v_float64x2(_mm_and_pd(x.val, + _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); +} + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) + +inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) +{ + __m128i delta = _mm_set1_epi8((char)-128); + return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), + _mm_xor_si128(b.val, delta)))); +} +inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) +{ + __m128i delta = _mm_set1_epi8((char)-128); + return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), + _mm_xor_si128(b.val, delta)))); +} +inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val))); +} +inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val)); +} +inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i delta = _mm_set1_epi32((int)0x80000000); + __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); + return v_uint32x4(v_select_si128(mask, b.val, a.val)); +} +inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& 
b) +{ + __m128i delta = _mm_set1_epi32((int)0x80000000); + __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); + return v_uint32x4(v_select_si128(mask, a.val, b.val)); +} +inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b) +{ + return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val)); +} +inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) +{ + return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val)); +} + +#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ +inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ +{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ +inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ +} \ +inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ +{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ +inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ +} \ +inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ +} \ +inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ +} \ +inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + __m128i not_mask = _mm_set1_epi32(-1); \ + __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ + return _Tpuvec(_mm_xor_si128(res, not_mask)); \ +} \ +inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + __m128i smask = _mm_set1_##suffix(sbit); \ + __m128i not_mask = _mm_set1_epi32(-1); \ + __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ + return _Tpuvec(_mm_xor_si128(res, not_mask)); \ +} \ +inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ +} \ +inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ +} \ +inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ +} \ +inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i not_mask = _mm_set1_epi32(-1); \ + return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ +} + +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) + +#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return 
_Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } + +OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) +OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) + +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) + +#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ +inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ +{ \ + return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ +} \ +inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ +{ \ + __m128i smask = _mm_set1_epi32(smask32); \ + __m128i a1 = _mm_xor_si128(a.val, smask); \ + __m128i b1 = _mm_xor_si128(b.val, smask); \ + return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ +} + +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) + +#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ +inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \ + return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \ +} \ +inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ + return _Tpvec(_mm_sqrt_##suffix(res)); \ +} \ +inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ + return _Tpvec(res); \ +} \ +inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ +{ \ + return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ +} + +OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) +OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) + +#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ +inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +{ \ + return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ +} \ +inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +{ \ + return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ +} \ +inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +{ \ + return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ +} \ +inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +{ \ + return _Tpsvec(srai(a.val, imm)); \ +} \ +template \ +inline _Tpuvec v_lshift(const _Tpuvec& a) \ +{ \ + return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ +} \ +template \ +inline _Tpsvec v_lshift(const _Tpsvec& 
a) \ +{ \ + return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ +} \ +template \ +inline _Tpuvec v_rshift(const _Tpuvec& a) \ +{ \ + return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ +} \ +template \ +inline _Tpsvec v_rshift(const _Tpsvec& a) \ +{ \ + return _Tpsvec(srai(a.val, imm)); \ +} + +OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16) +OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32) +OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) + +#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ + _mm_loadl_epi64((const __m128i*)ptr1))); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storeu_si128((__m128i*)ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ _mm_store_si128((__m128i*)ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storel_epi64((__m128i*)ptr, a.val); } \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); } + +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64) + +#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(_mm_load_##suffix(ptr)); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + return _Tpvec(_mm_castsi128_##suffix( \ + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ + _mm_loadl_epi64((const __m128i*)ptr1)))); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storeu_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ _mm_store_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + __m128i a1 = _mm_cast##suffix##_si128(a.val); \ + _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \ +} + +OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) +OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) + +#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + scalartype CV_DECL_ALIGNED(16) buf[4]; \ + v_store_aligned(buf, a); \ + scalartype s0 = scalar_func(buf[0], buf[1]); \ + scalartype s1 = scalar_func(buf[2], buf[3]); \ + return scalar_func(s0, s1); \ +} + +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) 
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) + +#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ +inline int v_signmask(const _Tpvec& a) \ +{ \ + return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ +} \ +inline bool v_check_all(const _Tpvec& a) \ +{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \ +inline bool v_check_any(const _Tpvec& a) \ +{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; } + +#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) +inline __m128i v_packq_epi32(__m128i a) +{ + __m128i b = _mm_packs_epi32(a, a); + return _mm_packs_epi16(b, b); +} + +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) + +#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \ +} + +OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) +OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) +OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) + +#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ +inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ +{ \ + __m128i z = _mm_setzero_si128(); \ + b0.val = _mm_unpacklo_##suffix(a.val, z); \ + b1.val = _mm_unpackhi_##suffix(a.val, z); \ +} \ +inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ +{ \ + __m128i z = _mm_setzero_si128(); \ + return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ +} \ +inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ +{ \ + b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ + b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ +} \ +inline _Tpwsvec v_load_expand(const _Tps* ptr) \ +{ \ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ + return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ +} + +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) 
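For illustration only (not part of the patch): a minimal sketch of how the comparison operators, v_select and the load/store helpers defined above might be combined into a branch-free truncation threshold. The function name, the four-lane v_int32x4 constructor used for broadcasting and the include path are assumptions; namespace qualifiers are omitted.

#include "opencv2/hal/intrin.hpp"

// illustrative only: clamp 32-bit values to an upper bound, 4 lanes per iteration
static void threshold_trunc_s32(const int* src, int* dst, int n, int thresh)
{
    v_int32x4 vthresh(thresh, thresh, thresh, thresh); // broadcast the bound (ctor assumed)
    int i = 0;
    for( ; i <= n - 4; i += 4 )
    {
        v_int32x4 x = v_load(src + i);
        // lanes where x > thresh become all-ones; v_select picks vthresh there, x elsewhere
        v_store(dst + i, v_select(x > vthresh, vthresh, x));
    }
    for( ; i < n; i++ )                 // scalar tail
        dst[i] = src[i] > thresh ? thresh : src[i];
}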
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) + +inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1) +{ + __m128i z = _mm_setzero_si128(); + b0.val = _mm_unpacklo_epi32(a.val, z); + b1.val = _mm_unpackhi_epi32(a.val, z); +} +inline v_uint64x2 v_load_expand(const unsigned* ptr) +{ + __m128i z = _mm_setzero_si128(); + return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z)); +} +inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1) +{ + __m128i s = _mm_srai_epi32(a.val, 31); + b0.val = _mm_unpacklo_epi32(a.val, s); + b1.val = _mm_unpackhi_epi32(a.val, s); +} +inline v_int64x2 v_load_expand(const int* ptr) +{ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); + __m128i s = _mm_srai_epi32(a, 31); + return v_int64x2(_mm_unpacklo_epi32(a, s)); +} + +inline v_uint32x4 v_load_expand_q(const uchar* ptr) +{ + __m128i z = _mm_setzero_si128(); + __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); + return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); +} + +inline v_int32x4 v_load_expand_q(const schar* ptr) +{ + __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); + a = _mm_unpacklo_epi8(a, a); + a = _mm_unpacklo_epi8(a, a); + return v_int32x4(_mm_srai_epi32(a, 24)); +} + +#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ +inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ +{ \ + b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \ + b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \ +} \ +inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ +{ \ + __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ + return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \ +} \ +inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ +{ \ + __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ + return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \ +} \ +inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ +{ \ + __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \ + c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \ + d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \ +} + +OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) +OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) + +inline v_int32x4 v_round(const v_float32x4& a) +{ return v_int32x4(_mm_cvtps_epi32(a.val)); } + +inline v_int32x4 v_floor(const v_float32x4& a) +{ + __m128i a1 = _mm_cvtps_epi32(a.val); + __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); + return v_int32x4(_mm_add_epi32(a1, mask)); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + __m128i a1 = _mm_cvtps_epi32(a.val); + __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); + return v_int32x4(_mm_sub_epi32(a1, mask)); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(_mm_cvttps_epi32(a.val)); } + +inline v_int32x4 v_round(const v_float64x2& a) +{ return 
v_int32x4(_mm_cvtpd_epi32(a.val)); } + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + __m128i a1 = _mm_cvtpd_epi32(a.val); + __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); + mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 + return v_int32x4(_mm_add_epi32(a1, mask)); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + __m128i a1 = _mm_cvtpd_epi32(a.val); + __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); + mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 + return v_int32x4(_mm_sub_epi32(a1, mask)); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ return v_int32x4(_mm_cvttpd_epi32(a.val)); } + +#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, \ + _Tpvec& b2, _Tpvec& b3) \ +{ \ + __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ + __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ + __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ + __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \ +\ + b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \ +} + +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) + +// adopted from sse_utils.hpp +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) +{ + __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); + __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16)); + __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32)); + + __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); + __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); + __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); + + __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); + __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12); + __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12)); + + __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21)); + __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22); + __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22)); + + a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31)); + b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32); + c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32)); +} + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) +{ + __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... + __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... + __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... + __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... + + __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... + __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... + __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... + __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... 
+ + u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... + u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... + u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... + u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... + + v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... + v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... + v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ... + v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ... + + a.val = _mm_unpacklo_epi8(v0, v1); + b.val = _mm_unpackhi_epi8(v0, v1); + c.val = _mm_unpacklo_epi8(v2, v3); + d.val = _mm_unpackhi_epi8(v2, v3); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) +{ + __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); + __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8)); + __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16)); + + __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01)); + __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02); + __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02)); + + __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11)); + __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12); + __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12)); + + a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21)); + b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22); + c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22)); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) +{ + __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 + __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ... + __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... + __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ... + + __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... + __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... + __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... + __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... + + u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ... + u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ... + u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ... + u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
+ + a.val = _mm_unpacklo_epi16(u0, u1); + b.val = _mm_unpackhi_epi16(u0, u1); + c.val = _mm_unpacklo_epi16(u2, u3); + d.val = _mm_unpackhi_epi16(u2, u3); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) +{ + __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); + __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4)); + __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8)); + + __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01)); + __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02); + __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02)); + + a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11)); + b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12); + c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12)); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) +{ + v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0 + v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1 + v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2 + v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3 + + v_transpose4x4(u0, u1, u2, u3, a, b, c, d); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c ) +{ + __m128i z = _mm_setzero_si128(); + __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val); + __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val); + __m128i c0 = _mm_unpacklo_epi8(c.val, z); + __m128i c1 = _mm_unpackhi_epi8(c.val, z); + + __m128i p00 = _mm_unpacklo_epi16(ab0, c0); + __m128i p01 = _mm_unpackhi_epi16(ab0, c0); + __m128i p02 = _mm_unpacklo_epi16(ab1, c1); + __m128i p03 = _mm_unpackhi_epi16(ab1, c1); + + __m128i p10 = _mm_unpacklo_epi32(p00, p01); + __m128i p11 = _mm_unpackhi_epi32(p00, p01); + __m128i p12 = _mm_unpacklo_epi32(p02, p03); + __m128i p13 = _mm_unpackhi_epi32(p02, p03); + + __m128i p20 = _mm_unpacklo_epi64(p10, p11); + __m128i p21 = _mm_unpackhi_epi64(p10, p11); + __m128i p22 = _mm_unpacklo_epi64(p12, p13); + __m128i p23 = _mm_unpackhi_epi64(p12, p13); + + p20 = _mm_slli_si128(p20, 1); + p22 = _mm_slli_si128(p22, 1); + + __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8); + __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8); + __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8); + __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8); + + __m128i p40 = _mm_unpacklo_epi64(p30, p31); + __m128i p41 = _mm_unpackhi_epi64(p30, p31); + __m128i p42 = _mm_unpacklo_epi64(p32, p33); + __m128i p43 = _mm_unpackhi_epi64(p32, p33); + + __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); + __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6)); + __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2)); + + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 32), v2); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c, const v_uint8x16& d) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ... + __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ... + __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ... 
+ __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ... + + __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ... + __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ... + __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ... + __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ... + + _mm_storeu_si128((__m128i*)ptr, v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); + _mm_storeu_si128((__m128i*)(ptr + 32), v1); + _mm_storeu_si128((__m128i*)(ptr + 48), v3); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, + const v_uint16x8& b, + const v_uint16x8& c ) +{ + __m128i z = _mm_setzero_si128(); + __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); + __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val); + __m128i c0 = _mm_unpacklo_epi16(c.val, z); + __m128i c1 = _mm_unpackhi_epi16(c.val, z); + + __m128i p10 = _mm_unpacklo_epi32(ab0, c0); + __m128i p11 = _mm_unpackhi_epi32(ab0, c0); + __m128i p12 = _mm_unpacklo_epi32(ab1, c1); + __m128i p13 = _mm_unpackhi_epi32(ab1, c1); + + __m128i p20 = _mm_unpacklo_epi64(p10, p11); + __m128i p21 = _mm_unpackhi_epi64(p10, p11); + __m128i p22 = _mm_unpacklo_epi64(p12, p13); + __m128i p23 = _mm_unpackhi_epi64(p12, p13); + + p20 = _mm_slli_si128(p20, 2); + p22 = _mm_slli_si128(p22, 2); + + __m128i p30 = _mm_unpacklo_epi64(p20, p21); + __m128i p31 = _mm_unpackhi_epi64(p20, p21); + __m128i p32 = _mm_unpacklo_epi64(p22, p23); + __m128i p33 = _mm_unpackhi_epi64(p22, p23); + + __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10)); + __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6)); + __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2)); + + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + const v_uint16x8& c, const v_uint16x8& d) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ... + __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ... + __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ... + __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ... + + __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ... + __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... + __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... + __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ... 
+ + _mm_storeu_si128((__m128i*)ptr, v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v2); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 24), v3); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c ) +{ + v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3; + v_transpose4x4(a, b, c, z, u0, u1, u2, u3); + + __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12)); + __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8)); + __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4)); + + _mm_storeu_si128((__m128i*)ptr, v0); + _mm_storeu_si128((__m128i*)(ptr + 4), v1); + _mm_storeu_si128((__m128i*)(ptr + 8), v2); +} + +inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + v_uint32x4 t0, t1, t2, t3; + v_transpose4x4(a, b, c, d, t0, t1, t2, t3); + v_store(ptr, t0); + v_store(ptr + 4, t1); + v_store(ptr + 8, t2); + v_store(ptr + 12, t3); +} + +#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \ +inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ + _Tpvec& b0, _Tpvec& c0 ) \ +{ \ + _Tpuvec a1, b1, c1; \ + v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix(a1); \ + b0 = v_reinterpret_as_##suffix(b1); \ + c0 = v_reinterpret_as_##suffix(c1); \ +} \ +inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ + _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \ +{ \ + _Tpuvec a1, b1, c1, d1; \ + v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix(a1); \ + b0 = v_reinterpret_as_##suffix(b1); \ + c0 = v_reinterpret_as_##suffix(c1); \ + d0 = v_reinterpret_as_##suffix(d1); \ +} \ +inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \ + const _Tpvec& b0, const _Tpvec& c0 ) \ +{ \ + _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \ + _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \ + _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \ + v_store_interleave((_Tpu*)ptr, a1, b1, c1); \ +} \ +inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \ + const _Tpvec& c0, const _Tpvec& d0 ) \ +{ \ + _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \ + _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \ + _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \ + _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \ + v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \ +} + +OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(_mm_cvtepi32_ps(a.val)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + return v_float32x4(_mm_cvtpd_ps(a.val)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + return v_float64x2(_mm_cvtepi32_pd(a.val)); +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + return v_float64x2(_mm_cvtps_pd(a.val)); +} + +} + +#endif From 6916a1438fdb9dab7b459077dc91e65e00a020ff Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 20 Apr 2015 21:34:37 +0300 Subject: [PATCH 10/12] fixed some newly introduced compile errors and warnings --- 
.../hal/include/opencv2/hal/intrin_cpp.hpp | 10 ++--- .../hal/include/opencv2/hal/intrin_neon.hpp | 12 +++--- .../hal/include/opencv2/hal/intrin_sse.hpp | 42 +++++++++---------- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin_cpp.hpp b/modules/hal/include/opencv2/hal/intrin_cpp.hpp index d0d5b28a29..e0140a8632 100644 --- a/modules/hal/include/opencv2/hal/intrin_cpp.hpp +++ b/modules/hal/include/opencv2/hal/intrin_cpp.hpp @@ -735,11 +735,11 @@ OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64) OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64) #define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \ -template inline _Tpvec v_lshift(const _Tpvec& a) \ +template inline _Tpvec v_shl(const _Tpvec& a) \ { return a << n; } \ -template inline _Tpvec v_rshift(const _Tpvec& a) \ +template inline _Tpvec v_shr(const _Tpvec& a) \ { return a >> n; } \ -template inline _Tpvec v_rshift_round(const _Tpvec& a) \ +template inline _Tpvec v_rshr(const _Tpvec& a) \ { \ _Tpvec c; \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ @@ -766,7 +766,7 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ } \ return c; \ } \ -template inline _Tpnvec v_rshift_round_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ +template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpnvec c; \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ @@ -781,7 +781,7 @@ inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ ptr[i] = saturate_cast<_Tpn>(a.s[i]); \ } \ -template inline void v_rshift_round_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ +template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ { \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp index 4bda95db68..ab6aa86315 100644 --- a/modules/hal/include/opencv2/hal/intrin_neon.hpp +++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp @@ -264,14 +264,14 @@ inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ vst1_##suffix(ptr, a1); \ } \ template inline \ -_Tpvec v_rshift_round_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \ { \ hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \ hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \ return _Tpvec(vcombine_##suffix(a1, b1)); \ } \ template inline \ -void v_rshift_round_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ { \ hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \ vst1_##suffix(ptr, a1); \ @@ -283,7 +283,7 @@ OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un) OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u32, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n) inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, @@ -516,11 +516,11 @@ inline _Tpvec operator << (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, 
vdupq_n_##ssuffix((_Tps)n))); } \ inline _Tpvec operator >> (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \ -template inline _Tpvec lshift(const _Tpvec& a) \ +template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \ -template inline _Tpvec rshift(const _Tpvec& a) \ +template inline _Tpvec v_shr(const _Tpvec& a) \ { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \ -template inline _Tpvec rshift_round(const _Tpvec& a) \ +template inline _Tpvec v_rshr(const _Tpvec& a) \ { return _Tpvec(vrshrq_n_##suffix(a.val, n)); } OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8) diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp index 12f5789ea5..3b77a11542 100644 --- a/modules/hal/include/opencv2/hal/intrin_sse.hpp +++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp @@ -318,7 +318,7 @@ inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } template inline -v_uint8x16 v_rshift_round_pack(const v_uint16x8& a, const v_uint16x8& b) +v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); @@ -327,7 +327,7 @@ v_uint8x16 v_rshift_round_pack(const v_uint16x8& a, const v_uint16x8& b) } template inline -void v_rshift_round_pack_store(uchar* ptr, const v_uint16x8& a) +void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n); @@ -335,7 +335,7 @@ void v_rshift_round_pack_store(uchar* ptr, const v_uint16x8& a) } template inline -v_uint8x16 v_rshift_round_pack_u(const v_int16x8& a, const v_int16x8& b) +v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n), @@ -343,7 +343,7 @@ v_uint8x16 v_rshift_round_pack_u(const v_int16x8& a, const v_int16x8& b) } template inline -void v_rshift_round_pack_u_store(uchar* ptr, const v_int16x8& a) +void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n); @@ -357,7 +357,7 @@ inline void v_pack_store(schar* ptr, v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } template inline -v_int8x16 v_rshift_round_pack(const v_int16x8& a, const v_int16x8& b) +v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); @@ -365,7 +365,7 @@ v_int8x16 v_rshift_round_pack(const v_int16x8& a, const v_int16x8& b) _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n))); } template inline -void v_rshift_round_pack_store(schar* ptr, const v_int16x8& a) +void v_rshr_pack_store(schar* ptr, const v_int16x8& a) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
__m128i delta = _mm_set1_epi16((short)(1 << (n-1))); @@ -398,7 +398,7 @@ inline void v_pack_store(ushort* ptr, const v_uint32x4& a) } template inline -v_uint16x8 v_rshift_round_pack(const v_uint32x4& a, const v_uint32x4& b) +v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); @@ -407,7 +407,7 @@ v_uint16x8 v_rshift_round_pack(const v_uint32x4& a, const v_uint32x4& b) } template inline -void v_rshift_round_pack_store(ushort* ptr, const v_uint32x4& a) +void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); @@ -431,7 +431,7 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) } template inline -void v_rshift_round_pack_u_store(ushort* ptr, const v_int32x4& a) +void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); @@ -448,7 +448,7 @@ inline void v_pack_store(short* ptr, const v_int32x4& a) } template inline -v_int16x8 v_rshift_round_pack(const v_int32x4& a, const v_int32x4& b) +v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) { __m128i delta = _mm_set1_epi32(1 << (n-1)); return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), @@ -456,7 +456,7 @@ v_int16x8 v_rshift_round_pack(const v_int32x4& a, const v_int32x4& b) } template inline -void v_rshift_round_pack_store(short* ptr, const v_int32x4& a) +void v_rshr_pack_store(short* ptr, const v_int32x4& a) { __m128i delta = _mm_set1_epi32(1 << (n-1)); __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); @@ -493,19 +493,19 @@ inline void v_pack_store(int* ptr, const v_int64x2& a) } template inline -v_uint32x4 v_rshift_round_pack(const v_uint64x2& a, const v_uint64x2& b) +v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) { uint64 delta = (uint64)1 << (n-1); v_uint64x2 delta2(delta, delta); - __m128 a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); - __m128 b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); + __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n); + __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 return v_uint32x4(_mm_unpacklo_epi64(v0, v1)); } template inline -void v_rshift_round_pack_store(unsigned* ptr, const v_uint64x2& a) +void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a) { uint64 delta = (uint64)1 << (n-1); v_uint64x2 delta2(delta, delta); @@ -526,7 +526,7 @@ inline __m128i v_srai_epi64(__m128i a, int imm) } template inline -v_int32x4 v_rshift_round_pack(const v_int64x2& a, const v_int64x2& b) +v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) { int64 delta = (int64)1 << (n-1); v_int64x2 delta2(delta, delta); @@ -538,7 +538,7 @@ v_int32x4 v_rshift_round_pack(const v_int64x2& a, const v_int64x2& b) } template inline -void v_rshift_round_pack_store(int* ptr, const v_int64x2& a) +void v_rshr_pack_store(int* ptr, const v_int64x2& a) { int64 delta = (int64)1 << (n-1); v_int64x2 delta2(delta, delta); @@ -901,22 +901,22 @@ inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ return 
_Tpsvec(srai(a.val, imm)); \ } \ template \ -inline _Tpuvec v_lshift(const _Tpuvec& a) \ +inline _Tpuvec v_shl(const _Tpuvec& a) \ { \ return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ } \ template \ -inline _Tpsvec v_lshift(const _Tpsvec& a) \ +inline _Tpsvec v_shl(const _Tpsvec& a) \ { \ return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ } \ template \ -inline _Tpuvec v_rshift(const _Tpuvec& a) \ +inline _Tpuvec v_shr(const _Tpuvec& a) \ { \ return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ } \ template \ -inline _Tpsvec v_rshift(const _Tpsvec& a) \ +inline _Tpsvec v_shr(const _Tpsvec& a) \ { \ return _Tpsvec(srai(a.val, imm)); \ } From fa4165342092dad706777ee614b2a913aff3e61a Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 21 Apr 2015 11:30:41 +0300 Subject: [PATCH 11/12] trying to avoid ABI checker failures --- cmake/templates/opencv_abi.xml.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/templates/opencv_abi.xml.in b/cmake/templates/opencv_abi.xml.in index 6a7a6d8d7e..a98f86f072 100644 --- a/cmake/templates/opencv_abi.xml.in +++ b/cmake/templates/opencv_abi.xml.in @@ -21,6 +21,8 @@ + opencv2/hal/intrin_neon.hpp + opencv2/hal/intrin_sse.hpp opencv2/core/cuda* opencv2/core/private* opencv/cxeigen.hpp From 926754a66e59b7af5ea27e87a7193baad944811b Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 21 Apr 2015 12:42:46 +0300 Subject: [PATCH 12/12] another attempt to resolve ABI checker complains --- cmake/templates/opencv_abi.xml.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/templates/opencv_abi.xml.in b/cmake/templates/opencv_abi.xml.in index a98f86f072..292d9b491b 100644 --- a/cmake/templates/opencv_abi.xml.in +++ b/cmake/templates/opencv_abi.xml.in @@ -21,8 +21,7 @@ - opencv2/hal/intrin_neon.hpp - opencv2/hal/intrin_sse.hpp + opencv2/hal/intrin* opencv2/core/cuda* opencv2/core/private* opencv/cxeigen.hpp
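To illustrate how the helpers renamed in these patches are meant to be combined, here is a minimal, hypothetical sketch (not part of the patch series) that averages two uchar buffers with rounding using v_load, v_expand, the saturating operator+ and v_rshr_pack. The function name and the include path are assumptions; namespace qualifiers are omitted, and the same source is expected to compile against the SSE2, NEON or C fallback implementation selected by intrin.hpp.

#include "opencv2/hal/intrin.hpp"

// (a[i] + b[i] + 1) >> 1 per byte; 16 pixels per iteration, scalar tail for the rest
static void average_u8(const uchar* a, const uchar* b, uchar* dst, int n)
{
    int i = 0;
    for( ; i <= n - 16; i += 16 )
    {
        v_uint16x8 a0, a1, b0, b1;
        v_expand(v_load(a + i), a0, a1);   // widen 16 x u8 into 2 x (8 x u16)
        v_expand(v_load(b + i), b0, b1);
        // add in 16 bits (no overflow for u8 inputs), then round-shift by 1 and pack back to u8
        v_store(dst + i, v_rshr_pack<1>(a0 + b0, a1 + b1));
    }
    for( ; i < n; i++ )
        dst[i] = (uchar)((a[i] + b[i] + 1) >> 1);
}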