// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html #include "opencv2/core/hal/intrin.hpp" //========================================= // Declare & Define & Dispatch in one step //========================================= // ARITHM_DISPATCHING_ONLY defined by arithm dispatch file #undef ARITHM_DECLARATIONS_ONLY #ifdef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY #define ARITHM_DECLARATIONS_ONLY #endif #undef ARITHM_DEFINITIONS_ONLY #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && !defined(ARITHM_DISPATCHING_ONLY) #define ARITHM_DEFINITIONS_ONLY #endif #ifdef ARITHM_DECLARATIONS_ONLY #undef DEFINE_SIMD #define DEFINE_SIMD(fun_name, c_type, ...) \ DECLARE_SIMD_FUN(fun_name, c_type) #endif // ARITHM_DECLARATIONS_ONLY #ifdef ARITHM_DEFINITIONS_ONLY #undef DEFINE_SIMD #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \ DECLARE_SIMD_FUN(fun_name, c_type) \ DEFINE_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__) #endif // ARITHM_DEFINITIONS_ONLY #ifdef ARITHM_DISPATCHING_ONLY #undef DEFINE_SIMD #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \ DISPATCH_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__) #endif // ARITHM_DISPATCHING_ONLY // workaround when neon miss support of double precision #undef DEFINE_NOSIMD #ifdef ARITHM_DEFINITIONS_ONLY #define DEFINE_NOSIMD(fun_name, c_type, ...) \ DECLARE_SIMD_FUN(fun_name, c_type) \ DEFINE_NOSIMD_FUN(fun_name, c_type, __VA_ARGS__) #else #define DEFINE_NOSIMD DEFINE_SIMD #endif // ARITHM_DEFINITIONS_ONLY #ifndef SIMD_GUARD #define DEFINE_SIMD_U8(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 8u), uchar, v_uint8, __VA_ARGS__) #define DEFINE_SIMD_S8(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 8s), schar, v_int8, __VA_ARGS__) #define DEFINE_SIMD_U16(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 16u), ushort, v_uint16, __VA_ARGS__) #define DEFINE_SIMD_S16(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 16s), short, v_int16, __VA_ARGS__) #define DEFINE_SIMD_S32(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 32s), int, v_int32, __VA_ARGS__) #define DEFINE_SIMD_F32(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__) #if CV_SIMD_64F #define DEFINE_SIMD_F64(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__) #else #define DEFINE_SIMD_F64(fun, ...) \ DEFINE_NOSIMD(__CV_CAT(fun, 64f), double, __VA_ARGS__) #endif #define DEFINE_SIMD_SAT(fun, ...) \ DEFINE_SIMD_U8(fun, __VA_ARGS__) \ DEFINE_SIMD_S8(fun, __VA_ARGS__) \ DEFINE_SIMD_U16(fun, __VA_ARGS__) \ DEFINE_SIMD_S16(fun, __VA_ARGS__) #define DEFINE_SIMD_NSAT(fun, ...) \ DEFINE_SIMD_S32(fun, __VA_ARGS__) \ DEFINE_SIMD_F32(fun, __VA_ARGS__) \ DEFINE_SIMD_F64(fun, __VA_ARGS__) #define DEFINE_SIMD_ALL(fun, ...) \ DEFINE_SIMD_SAT(fun, __VA_ARGS__) \ DEFINE_SIMD_NSAT(fun, __VA_ARGS__) #endif // SIMD_GUARD /////////////////////////////////////////////////////////////////////////// namespace cv { namespace hal { #ifndef ARITHM_DISPATCHING_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN #endif #ifdef ARITHM_DEFINITIONS_ONLY #if !CV_SIMD_64F typedef int v_float64; // dummy #endif //======================================= // Utility //======================================= /** add **/ template static inline T c_add(T a, T b) { return saturate_cast(a + b); } template<> inline uchar c_add(uchar a, uchar b) { return CV_FAST_CAST_8U(a + b); } // scale template static inline T1 c_add(T1 a, T1 b, T2 scalar) { return saturate_cast((T2)a * scalar + b); } template<> inline uchar c_add(uchar a, uchar b, float scalar) { return saturate_cast(CV_8TO32F(a) * scalar + b); } // weight template static inline T1 c_add(T1 a, T1 b, T2 alpha, T2 beta, T2 gamma) { return saturate_cast(a * alpha + b * beta + gamma); } template<> inline uchar c_add(uchar a, uchar b, float alpha, float beta, float gamma) { return saturate_cast(CV_8TO32F(a) * alpha + CV_8TO32F(b) * beta + gamma); } /** sub **/ template static inline T c_sub(T a, T b) { return saturate_cast(a - b); } template<> inline uchar c_sub(uchar a, uchar b) { return CV_FAST_CAST_8U(a - b); } /** max **/ template static inline T c_max(T a, T b) { return std::max(a, b); } template<> inline uchar c_max(uchar a, uchar b) { return CV_MAX_8U(a, b); } /** min **/ template static inline T c_min(T a, T b) { return std::min(a, b); } template<> inline uchar c_min(uchar a, uchar b) { return CV_MIN_8U(a, b); } /** absdiff **/ template static inline T c_absdiff(T a, T b) { return a > b ? a - b : b - a; } template<> inline schar c_absdiff(schar a, schar b) { return saturate_cast(std::abs(a - b)); } template<> inline short c_absdiff(short a, short b) { return saturate_cast(std::abs(a - b)); } // specializations to prevent "-0" results template<> inline float c_absdiff(float a, float b) { return std::abs(a - b); } template<> inline double c_absdiff(double a, double b) { return std::abs(a - b); } /** multiply **/ template static inline T c_mul(T a, T b) { return saturate_cast(a * b); } template<> inline uchar c_mul(uchar a, uchar b) { return CV_FAST_CAST_8U(a * b); } // scale template static inline T1 c_mul(T1 a, T1 b, T2 scalar) { return saturate_cast(scalar * (T2)a * b); } template<> inline uchar c_mul(uchar a, uchar b, float scalar) { return saturate_cast(scalar * CV_8TO32F(a) * CV_8TO32F(b)); } /** divide & reciprocal **/ template static inline T2 c_div(T1 a, T2 b) { return saturate_cast(a / b); } // recip template<> inline uchar c_div(float a, uchar b) { return saturate_cast(a / CV_8TO32F(b)); } // scale template static inline T1 c_div(T1 a, T1 b, T2 scalar) { return saturate_cast(scalar * (T2)a / b); } template<> inline uchar c_div(uchar a, uchar b, float scalar) { return saturate_cast(scalar * CV_8TO32F(a) / CV_8TO32F(b)); } //======================================= // Arithmetic and logical operations // +, -, *, /, &, |, ^, ~, abs ... //======================================= ///////////////////////////// Operations ////////////////////////////////// // Add template struct op_add { static inline Tvec r(const Tvec& a, const Tvec& b) { return a + b; } static inline T1 r(T1 a, T1 b) { return c_add(a, b); } }; // Subtract template struct op_sub { static inline Tvec r(const Tvec& a, const Tvec& b) { return a - b; } static inline T1 r(T1 a, T1 b) { return c_sub(a, b); } }; // Max & Min template struct op_max { static inline Tvec r(const Tvec& a, const Tvec& b) { return v_max(a, b); } static inline T1 r(T1 a, T1 b) { return c_max(a, b); } }; template struct op_min { static inline Tvec r(const Tvec& a, const Tvec& b) { return v_min(a, b); } static inline T1 r(T1 a, T1 b) { return c_min(a, b); } }; // Absolute difference template struct op_absdiff { static inline Tvec r(const Tvec& a, const Tvec& b) { return v_absdiff(a, b); } static inline T1 r(T1 a, T1 b) { return c_absdiff(a, b); } }; // Signed absolute difference, 's' template<> struct op_absdiff { static inline v_int8 r(const v_int8& a, const v_int8& b) { return v_absdiffs(a, b); } static inline schar r(schar a, schar b) { return c_absdiff(a, b); } }; template<> struct op_absdiff { static inline v_int16 r(const v_int16& a, const v_int16& b) { return v_absdiffs(a, b); } static inline short r(short a, short b) { return c_absdiff(a, b); } }; template<> struct op_absdiff { static inline v_int32 r(const v_int32& a, const v_int32& b) { return v_reinterpret_as_s32(v_absdiff(a, b)); } static inline int r(int a, int b) { return c_absdiff(a, b); } }; // Logical template struct op_or { static inline Tvec r(const Tvec& a, const Tvec& b) { return a | b; } static inline T1 r(T1 a, T1 b) { return a | b; } }; template struct op_xor { static inline Tvec r(const Tvec& a, const Tvec& b) { return a ^ b; } static inline T1 r(T1 a, T1 b) { return a ^ b; } }; template struct op_and { static inline Tvec r(const Tvec& a, const Tvec& b) { return a & b; } static inline T1 r(T1 a, T1 b) { return a & b; } }; template struct op_not { // ignored b from loader level static inline Tvec r(const Tvec& a) { return ~a; } static inline T1 r(T1 a, T1) { return ~a; } }; //////////////////////////// Loaders ///////////////////////////////// #if CV_SIMD template< template class OP, typename T1, typename Tvec> struct bin_loader { typedef OP op; static inline void l(const T1* src1, const T1* src2, T1* dst) { Tvec a = vx_load(src1); Tvec b = vx_load(src2); v_store(dst, op::r(a, b)); } static inline void la(const T1* src1, const T1* src2, T1* dst) { Tvec a = vx_load_aligned(src1); Tvec b = vx_load_aligned(src2); v_store_aligned(dst, op::r(a, b)); // todo: try write without cache } static inline void l64(const T1* src1, const T1* src2, T1* dst) { Tvec a = vx_load_low(src1), b = vx_load_low(src2); v_store_low(dst, op::r(a, b)); } }; // void src2 for operation "not" template struct bin_loader { typedef op_not op; static inline void l(const T1* src1, const T1*, T1* dst) { Tvec a = vx_load(src1); v_store(dst, op::r(a)); } static inline void la(const T1* src1, const T1*, T1* dst) { Tvec a = vx_load_aligned(src1); v_store_aligned(dst, op::r(a)); } static inline void l64(const T1* src1, const T1*, T1* dst) { Tvec a = vx_load_low(src1); v_store_low(dst, op::r(a)); } }; #endif // CV_SIMD //////////////////////////// Loops ///////////////////////////////// template static inline bool is_aligned(const T1* src1, const T1* src2, const T2* dst) { return (((size_t)src1|(size_t)src2|(size_t)dst) & (CV_SIMD_WIDTH - 1)) == 0; } template class OP, typename T1, typename Tvec> static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) { typedef OP op; #if CV_SIMD typedef bin_loader ldr; enum {wide_step = Tvec::nlanes}; #if !CV_NEON && CV_SIMD_WIDTH == 16 enum {wide_step_l = wide_step * 2}; #else enum {wide_step_l = wide_step}; #endif #endif // CV_SIMD step1 /= sizeof(T1); step2 /= sizeof(T1); step /= sizeof(T1); for (; height--; src1 += step1, src2 += step2, dst += step) { int x = 0; #if CV_SIMD #if !CV_NEON if (is_aligned(src1, src2, dst)) { for (; x <= width - wide_step_l; x += wide_step_l) { ldr::la(src1 + x, src2 + x, dst + x); #if !CV_NEON && CV_SIMD_WIDTH == 16 ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); #endif } } else #endif for (; x <= width - wide_step_l; x += wide_step_l) { ldr::l(src1 + x, src2 + x, dst + x); #if !CV_NEON && CV_SIMD_WIDTH == 16 ldr::l(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); #endif } #if CV_SIMD_WIDTH == 16 for (; x <= width - 8/(int)sizeof(T1); x += 8/(int)sizeof(T1)) { ldr::l64(src1 + x, src2 + x, dst + x); } #endif #endif // CV_SIMD #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 for (; x <= width - 4; x += 4) { T1 t0 = op::r(src1[x], src2[x]); T1 t1 = op::r(src1[x + 1], src2[x + 1]); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], src2[x + 2]); t1 = op::r(src1[x + 3], src2[x + 3]); dst[x + 2] = t0; dst[x + 3] = t1; } #endif for (; x < width; x++) dst[x] = op::r(src1[x], src2[x]); } vx_cleanup(); } #if !CV_SIMD_64F template class OP, typename T1, typename Tvec> static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) { typedef OP op; step1 /= sizeof(T1); step2 /= sizeof(T1); step /= sizeof(T1); for (; height--; src1 += step1, src2 += step2, dst += step) { int x = 0; for (; x <= width - 4; x += 4) { T1 t0 = op::r(src1[x], src2[x]); T1 t1 = op::r(src1[x + 1], src2[x + 1]); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], src2[x + 2]); t1 = op::r(src1[x + 3], src2[x + 3]); dst[x + 2] = t0; dst[x + 3] = t1; } for (; x < width; x++) dst[x] = op::r(src1[x], src2[x]); } } #define BIN_LOOP64F bin_loop_nosimd #else #define BIN_LOOP64F bin_loop #endif //!CV_SIMD_64F #endif // ARITHM_DEFINITIONS_ONLY //////////////////////////////////////////////////////////////////////////////////// #ifndef SIMD_GUARD #define BIN_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ _T1* dst, size_t step, int width, int height #define BIN_ARGS_PASS src1, step1, src2, step2, dst, step, width, height #endif // SIMD_GUARD #undef DECLARE_SIMD_FUN #define DECLARE_SIMD_FUN(fun, _T1) void fun(BIN_ARGS(_T1)); #undef DISPATCH_SIMD_FUN #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, _OP) \ void fun(BIN_ARGS(_T1), void*) \ { \ CV_INSTRUMENT_REGION(); \ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), BIN_ARGS_PASS) \ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), BIN_ARGS_PASS) \ CV_CPU_DISPATCH(fun, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL); \ } #undef DEFINE_SIMD_FUN #define DEFINE_SIMD_FUN(fun, _T1, _Tvec, _OP) \ void fun(BIN_ARGS(_T1)) \ { \ CV_INSTRUMENT_REGION(); \ bin_loop<_OP, _T1, _Tvec>(BIN_ARGS_PASS); \ } #undef DEFINE_NOSIMD_FUN #define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \ void fun(BIN_ARGS(_T1)) \ { \ CV_INSTRUMENT_REGION(); \ bin_loop_nosimd<_OP, _T1, v_float64>(BIN_ARGS_PASS); \ } DEFINE_SIMD_ALL(add, op_add) DEFINE_SIMD_ALL(sub, op_sub) DEFINE_SIMD_ALL(min, op_min) DEFINE_SIMD_ALL(max, op_max) DEFINE_SIMD_ALL(absdiff, op_absdiff) DEFINE_SIMD_U8(or, op_or) DEFINE_SIMD_U8(xor, op_xor) DEFINE_SIMD_U8(and, op_and) // One source!, an exception for operation "not" // we could use macros here but it's better to implement it // with that way to give more clarification // about how macroS "DEFINE_SIMD_*" are works #if defined(ARITHM_DECLARATIONS_ONLY) || defined(ARITHM_DEFINITIONS_ONLY) void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); #endif #ifdef ARITHM_DEFINITIONS_ONLY void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) { CV_INSTRUMENT_REGION(); bin_loop(src1, step1, src2, step2, dst, step, width, height); } #endif #ifdef ARITHM_DISPATCHING_ONLY void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void*) { CV_INSTRUMENT_REGION(); CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height) ARITHM_CALL_IPP(arithm_ipp_not8u, src1, step1, dst, step, width, height) CV_CPU_DISPATCH(not8u, (src1, step1, src2, step2, dst, step, width, height), CV_CPU_DISPATCH_MODES_ALL); } #endif //======================================= // Compare //======================================= #ifdef ARITHM_DEFINITIONS_ONLY ///////////////////////////// Operations ////////////////////////////////// template struct op_cmplt { static inline Tvec r(const Tvec& a, const Tvec& b) { return a < b; } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a < b); } }; template struct op_cmple { static inline Tvec r(const Tvec& a, const Tvec& b) { return a <= b; } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a <= b); } }; template struct op_cmpeq { static inline Tvec r(const Tvec& a, const Tvec& b) { return a == b; } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a == b); } }; template struct op_cmpne { static inline Tvec r(const Tvec& a, const Tvec& b) { return a != b; } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a != b); } }; //////////////////////////// Loaders ///////////////////////////////// #if CV_SIMD // todo: add support for RW alignment & stream template class OP, typename T1, typename Tvec> struct cmp_loader_n { void l(const T1* src1, const T1* src2, uchar* dst); }; template class OP, typename T1, typename Tvec> struct cmp_loader_n { typedef OP op; static inline void l(const T1* src1, const T1* src2, uchar* dst) { Tvec a = vx_load(src1); Tvec b = vx_load(src2); v_store(dst, v_reinterpret_as_u8(op::r(a, b))); } }; template class OP, typename T1, typename Tvec> struct cmp_loader_n { typedef OP op; enum {step = Tvec::nlanes}; static inline void l(const T1* src1, const T1* src2, uchar* dst) { Tvec c0 = op::r(vx_load(src1), vx_load(src2)); Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step)); v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1))); } }; template class OP, typename T1, typename Tvec> struct cmp_loader_n { typedef OP op; enum {step = Tvec::nlanes}; static inline void l(const T1* src1, const T1* src2, uchar* dst) { v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2))); v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step))); v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); v_uint32 c3 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); v_store(dst, v_pack_b(c0, c1, c2, c3)); } }; template class OP, typename T1, typename Tvec> struct cmp_loader_n { typedef OP op; enum {step = Tvec::nlanes}; static inline void l(const T1* src1, const T1* src2, uchar* dst) { v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2))); v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step))); v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); v_uint64 c3 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); v_uint64 c4 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 4), vx_load(src2 + step * 4))); v_uint64 c5 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 5), vx_load(src2 + step * 5))); v_uint64 c6 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 6), vx_load(src2 + step * 6))); v_uint64 c7 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 7), vx_load(src2 + step * 7))); v_store(dst, v_pack_b(c0, c1, c2, c3, c4, c5, c6, c7)); } }; #endif // CV_SIMD //////////////////////////// Loops ///////////////////////////////// template class OP, typename T1, typename Tvec> static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) { typedef OP op; #if CV_SIMD typedef cmp_loader_n ldr; enum {wide_step = Tvec::nlanes * sizeof(T1)}; #endif // CV_SIMD step1 /= sizeof(T1); step2 /= sizeof(T1); for (; height--; src1 += step1, src2 += step2, dst += step) { int x = 0; #if CV_SIMD for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, dst + x); } #endif // CV_SIMD #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 for (; x <= width - 4; x += 4) { uchar t0 = op::r(src1[x], src2[x]); uchar t1 = op::r(src1[x + 1], src2[x + 1]); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], src2[x + 2]); t1 = op::r(src1[x + 3], src2[x + 3]); dst[x + 2] = t0; dst[x + 3] = t1; } #endif for (; x < width; x++) dst[x] = op::r(src1[x], src2[x]); } vx_cleanup(); } template static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height, int cmpop) { switch(cmpop) { case CMP_LT: cmp_loop(src1, step1, src2, step2, dst, step, width, height); break; case CMP_GT: cmp_loop(src2, step2, src1, step1, dst, step, width, height); break; case CMP_LE: cmp_loop(src1, step1, src2, step2, dst, step, width, height); break; case CMP_GE: cmp_loop(src2, step2, src1, step1, dst, step, width, height); break; case CMP_EQ: cmp_loop(src1, step1, src2, step2, dst, step, width, height); break; default: CV_Assert(cmpop == CMP_NE); cmp_loop(src1, step1, src2, step2, dst, step, width, height); break; } } #if !CV_SIMD_64F template< template class OP, typename T1> static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) { typedef OP op; step1 /= sizeof(T1); step2 /= sizeof(T1); for (; height--; src1 += step1, src2 += step2, dst += step) { int x = 0; for (; x <= width - 4; x += 4) { uchar t0 = op::r(src1[x], src2[x]); uchar t1 = op::r(src1[x + 1], src2[x + 1]); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], src2[x + 2]); t1 = op::r(src1[x + 3], src2[x + 3]); dst[x + 2] = t0; dst[x + 3] = t1; } for (; x < width; x++) dst[x] = op::r(src1[x], src2[x]); } } static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, int cmpop) { switch(cmpop) { case CMP_LT: cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); break; case CMP_GT: cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); break; case CMP_LE: cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); break; case CMP_GE: cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); break; case CMP_EQ: cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); break; default: CV_Assert(cmpop == CMP_NE); cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); break; } } #endif // !CV_SIMD_64F #endif // ARITHM_DEFINITIONS_ONLY ///////////////////////////////////////////////////////////////////////////////////////////// #ifndef SIMD_GUARD #define CMP_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ uchar* dst, size_t step, int width, int height #define CMP_ARGS_PASS src1, step1, src2, step2, dst, step, width, height #endif // SIMD_GUARD #undef DECLARE_SIMD_FUN #define DECLARE_SIMD_FUN(fun, _T1) void fun(CMP_ARGS(_T1), int cmpop); #undef DISPATCH_SIMD_FUN #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ void fun(CMP_ARGS(_T1), void* _cmpop) \ { \ CV_INSTRUMENT_REGION(); \ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ CV_CPU_DISPATCH(fun, (CMP_ARGS_PASS, *(int*)_cmpop), CV_CPU_DISPATCH_MODES_ALL); \ } #undef DEFINE_SIMD_FUN #define DEFINE_SIMD_FUN(fun, _T1, _Tvec, ...) \ void fun(CMP_ARGS(_T1), int cmpop) \ { \ CV_INSTRUMENT_REGION(); \ cmp_loop<_T1, _Tvec>(CMP_ARGS_PASS, cmpop); \ } #undef DEFINE_NOSIMD_FUN #define DEFINE_NOSIMD_FUN(fun, _T1, _Tvec, ...) \ void fun(CMP_ARGS(_T1), int cmpop) \ { \ CV_INSTRUMENT_REGION(); \ cmp_loop_nosimd(CMP_ARGS_PASS, cmpop); \ } // todo: try to avoid define dispatcher functions using macros with these such cases DEFINE_SIMD_ALL(cmp) //========================================================================= // scaling helpers for single and dual source // // Dual: Multiply, Div, AddWeighted // // Single: Reciprocal // //========================================================================= #ifdef ARITHM_DEFINITIONS_ONLY //////////////////////////// Loaders /////////////////////////////// #if CV_SIMD // todo: add support for RW alignment & stream template class OP, typename T1, typename T2, typename Tvec> struct scalar_loader_n { void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst); // single source void l(const T1* src1, const T2* scalar, T1* dst); }; template class OP, typename T1, typename T2, typename Tvec> struct scalar_loader_n { typedef OP op; static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) { v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); v_int16 v_src2 = v_reinterpret_as_s16(vx_load_expand(src2)); v_int32 t0, t1, t2, t3; v_expand(v_src1, t0, t2); v_expand(v_src2, t1, t3); v_float32 f0, f1, f2, f3; f0 = v_cvt_f32(t0); f1 = v_cvt_f32(t1); f2 = v_cvt_f32(t2); f3 = v_cvt_f32(t3); f0 = op::r(f0, f1, scalar); f2 = op::r(f2, f3, scalar); v_int32 r0 = v_round(f0); v_int32 r1 = v_round(f2); store(dst, v_src2, r0, r1); } static inline void l(const T1* src1, const T2* scalar, T1* dst) { v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); v_int32 t0, t1; v_expand(v_src1, t0, t1); v_float32 f0, f1; f0 = v_cvt_f32(t0); f1 = v_cvt_f32(t1); f0 = op::r(f0, scalar); f1 = op::r(f1, scalar); v_int32 r0 = v_round(f0); v_int32 r1 = v_round(f1); store(dst, v_src1, r0, r1); } static inline void store(uchar* dst, const v_int16& src, const v_int32& a, const v_int32& b) { v_pack_u_store(dst, op::pre(src, v_pack(a, b))); } static inline void store(schar* dst, const v_int16& src, const v_int32& a, const v_int32& b) { v_pack_store(dst, op::pre(src, v_pack(a, b))); } }; template class OP, typename T1, typename T2, typename Tvec> struct scalar_loader_n { typedef typename V_RegTraits::w_reg Twvec; typedef OP op; static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) { Tvec v_src1 = vx_load(src1); Tvec v_src2 = vx_load(src2); Twvec t0, t1, t2, t3; v_expand(v_src1, t0, t2); v_expand(v_src2, t1, t3); v_float32 f0, f1, f2, f3; f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); f0 = op::r(f0, f1, scalar); f2 = op::r(f2, f3, scalar); v_int32 r0 = v_round(f0); v_int32 r1 = v_round(f2); store(dst, v_src2, r0, r1); } static inline void l(const T1* src1, const T2* scalar, T1* dst) { Tvec v_src1 = vx_load(src1); Twvec t0, t1; v_expand(v_src1, t0, t1); v_float32 f0, f1; f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); f0 = op::r(f0, scalar); f1 = op::r(f1, scalar); v_int32 r0 = v_round(f0); v_int32 r1 = v_round(f1); store(dst, v_src1, r0, r1); } static inline void store(ushort* dst, const Tvec& src, const v_int32& a, const v_int32& b) { v_store(dst, op::pre(src, v_pack_u(a, b))); } static inline void store(short* dst, const Tvec& src, const v_int32& a, const v_int32& b) { v_store(dst, op::pre(src, v_pack(a, b))); } }; template class OP, typename T2> struct scalar_loader_n { typedef OP op; enum {step = v_int32::nlanes}; static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst) { v_int32 v_src1 = vx_load(src1); v_int32 v_src2 = vx_load(src2); v_int32 v_src1s = vx_load(src1 + step); v_int32 v_src2s = vx_load(src2 + step); v_float32 f0, f1, f2, f3; f0 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); f1 = v_cvt_f32(v_reinterpret_as_s32(v_src2)); f2 = v_cvt_f32(v_reinterpret_as_s32(v_src1s)); f3 = v_cvt_f32(v_reinterpret_as_s32(v_src2s)); f0 = op::r(f0, f1, scalar); f2 = op::r(f2, f3, scalar); v_int32 r0 = v_round(f0); v_int32 r1 = v_round(f2); r0 = op::pre(v_src2, r0); r1 = op::pre(v_src2s, r1); v_store(dst, r0); v_store(dst + step, r1); } static inline void l(const int* src1, const T2* scalar, int* dst) { v_int32 v_src1 = vx_load(src1); v_int32 v_src1s = vx_load(src1 + step); v_float32 f0, f1; f0 = v_cvt_f32(v_src1); f1 = v_cvt_f32(v_src1s); f0 = op::r(f0, scalar); f1 = op::r(f1, scalar); v_int32 r0 = v_round(f0); v_int32 r1 = v_round(f1); r0 = op::pre(v_src1, r0); r1 = op::pre(v_src1s, r1); v_store(dst, r0); v_store(dst + step, r1); } }; template class OP, typename T2> struct scalar_loader_n { typedef OP op; enum {step = v_float32::nlanes}; static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst) { v_float32 v_src1 = vx_load(src1); v_float32 v_src2 = vx_load(src2); v_float32 v_src1s = vx_load(src1 + step); v_float32 v_src2s = vx_load(src2 + step); v_float32 r0 = op::r(v_src1, v_src2, scalar); v_float32 r1 = op::r(v_src1s, v_src2s, scalar); #if CV_VERSION_MAJOR == 3 r0 = op::pre(v_src2, r0); r1 = op::pre(v_src2s, r1); #endif v_store(dst, r0); v_store(dst + step, r1); } static inline void l(const float* src1, const T2* scalar, float* dst) { v_float32 v_src1 = vx_load(src1); v_float32 v_src1s = vx_load(src1 + step); v_float32 r0 = op::r(v_src1, scalar); v_float32 r1 = op::r(v_src1s, scalar); #if CV_VERSION_MAJOR == 3 r0 = op::pre(v_src1, r0); r1 = op::pre(v_src1s, r1); #endif v_store(dst, r0); v_store(dst + step, r1); } }; #endif // CV_SIMD #if CV_SIMD_64F template class OP> struct scalar_loader_n { typedef OP op; typedef OP op64; enum {step = v_int32::nlanes}; static inline void l(const int* src1, const int* src2, const double* scalar, int* dst) { v_int32 v_src1 = vx_load(src1); v_int32 v_src2 = vx_load(src2); v_int32 v_src1s = vx_load(src1 + step); v_int32 v_src2s = vx_load(src2 + step); v_int32 r0 = r(v_src1, v_src2, scalar); v_int32 r1 = r(v_src1s, v_src2s, scalar); r0 = op::pre(v_src2, r0); r1 = op::pre(v_src2s, r1); v_store(dst, r0); v_store(dst + step, r1); } static inline void l(const int* src1, const double* scalar, int* dst) { v_int32 v_src1 = vx_load(src1); v_int32 v_src1s = vx_load(src1 + step); v_int32 r0 = r(v_src1, scalar); v_int32 r1 = r(v_src1s, scalar); r0 = op::pre(v_src1, r0); r1 = op::pre(v_src1s, r1); v_store(dst, r0); v_store(dst + step, r1); } static inline v_int32 r(const v_int32& a, const v_int32& b, const double* scalar) { v_float64 f0, f1, f2, f3; f0 = v_cvt_f64(a); f1 = v_cvt_f64_high(a); f2 = v_cvt_f64(b); f3 = v_cvt_f64_high(b); v_float64 r0 = op64::r(f0, f2, scalar); v_float64 r1 = op64::r(f1, f3, scalar); return v_round(r0, r1); } static inline v_int32 r(const v_int32& a, const double* scalar) { v_float64 f0, f1; f0 = v_cvt_f64(a); f1 = v_cvt_f64_high(a); v_float64 r0 = op64::r(f0, scalar); v_float64 r1 = op64::r(f1, scalar); return v_round(r0, r1); } }; template class OP> struct scalar_loader_n { typedef OP op; typedef OP op64; enum {step = v_float32::nlanes}; static inline void l(const float* src1, const float* src2, const double* scalar, float* dst) { v_float32 v_src1 = vx_load(src1); v_float32 v_src2 = vx_load(src2); v_float32 v_src1s = vx_load(src1 + step); v_float32 v_src2s = vx_load(src2 + step); v_float32 r0 = r(v_src1, v_src2, scalar); v_float32 r1 = r(v_src1s, v_src2s, scalar); #if CV_VERSION_MAJOR == 3 r0 = op::pre(v_src2, r0); r1 = op::pre(v_src2s, r1); #endif v_store(dst, r0); v_store(dst + step, r1); } static inline void l(const float* src1, const double* scalar, float* dst) { v_float32 v_src1 = vx_load(src1); v_float32 v_src1s = vx_load(src1 + step); v_float32 r0 = r(v_src1, scalar); v_float32 r1 = r(v_src1s, scalar); #if CV_VERSION_MAJOR == 3 r0 = op::pre(v_src1, r0); r1 = op::pre(v_src1s, r1); #endif v_store(dst, r0); v_store(dst + step, r1); } static inline v_float32 r(const v_float32& a, const v_float32& b, const double* scalar) { v_float64 f0, f1, f2, f3; f0 = v_cvt_f64(a); f1 = v_cvt_f64_high(a); f2 = v_cvt_f64(b); f3 = v_cvt_f64_high(b); v_float64 r0 = op64::r(f0, f2, scalar); v_float64 r1 = op64::r(f1, f3, scalar); return v_cvt_f32(r0, r1); } static inline v_float32 r(const v_float32& a, const double* scalar) { v_float64 f0, f1; f0 = v_cvt_f64(a); f1 = v_cvt_f64_high(a); v_float64 r0 = op64::r(f0, scalar); v_float64 r1 = op64::r(f1, scalar); return v_cvt_f32(r0, r1); } }; template class OP> struct scalar_loader_n { typedef OP op; enum {step = v_float64::nlanes}; static inline void l(const double* src1, const double* src2, const double* scalar, double* dst) { v_float64 v_src1 = vx_load(src1); v_float64 v_src2 = vx_load(src2); v_float64 v_src1s = vx_load(src1 + step); v_float64 v_src2s = vx_load(src2 + step); v_float64 r0 = op::r(v_src1, v_src2, scalar); v_float64 r1 = op::r(v_src1s, v_src2s, scalar); #if CV_VERSION_MAJOR == 3 r0 = op::pre(v_src2, r0); r1 = op::pre(v_src2s, r1); #endif v_store(dst, r0); v_store(dst + step, r1); } static inline void l(const double* src1, const double* scalar, double* dst) { v_float64 v_src1 = vx_load(src1); v_float64 v_src1s = vx_load(src1 + step); v_float64 r0 = op::r(v_src1, scalar); v_float64 r1 = op::r(v_src1s, scalar); #if CV_VERSION_MAJOR == 3 r0 = op::pre(v_src1, r0); r1 = op::pre(v_src1s, r1); #endif v_store(dst, r0); v_store(dst + step, r1); } }; #endif // CV_SIMD_64F //////////////////////////// Loops ///////////////////////////////// // dual source template class OP, typename T1, typename T2, typename Tvec> static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; #if CV_SIMD typedef scalar_loader_n ldr; const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; #endif // CV_SIMD step1 /= sizeof(T1); step2 /= sizeof(T1); step /= sizeof(T1); for (; height--; src1 += step1, src2 += step2, dst += step) { int x = 0; #if CV_SIMD for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, scalar, dst + x); } #endif // CV_SIMD #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 for (; x <= width - 4; x += 4) { T1 t0 = op::r(src1[x], src2[x], scalar); T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], src2[x + 2], scalar); t1 = op::r(src1[x + 3], src2[x + 3], scalar); dst[x + 2] = t0; dst[x + 3] = t1; } #endif for (; x < width; ++x) dst[x] = op::r(src1[x], src2[x], scalar); } vx_cleanup(); } // single source template class OP, typename T1, typename T2, typename Tvec> static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; #if CV_SIMD typedef scalar_loader_n ldr; const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; #endif // CV_SIMD step1 /= sizeof(T1); step /= sizeof(T1); for (; height--; src1 += step1, dst += step) { int x = 0; #if CV_SIMD for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, scalar, dst + x); } #endif // CV_SIMD #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 for (; x <= width - 4; x += 4) { T1 t0 = op::r(src1[x], scalar); T1 t1 = op::r(src1[x + 1], scalar); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], scalar); t1 = op::r(src1[x + 3], scalar); dst[x + 2] = t0; dst[x + 3] = t1; } #endif for (; x < width; ++x) dst[x] = op::r(src1[x], scalar); } vx_cleanup(); } #if !CV_SIMD_64F // dual source template class OP, typename T1, typename T2, typename Tvec> static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; step1 /= sizeof(T1); step2 /= sizeof(T1); step /= sizeof(T1); for (; height--; src1 += step1, src2 += step2, dst += step) { int x = 0; for (; x <= width - 4; x += 4) { T1 t0 = op::r(src1[x], src2[x], scalar); T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], src2[x + 2], scalar); t1 = op::r(src1[x + 3], src2[x + 3], scalar); dst[x + 2] = t0; dst[x + 3] = t1; } for (; x < width; ++x) dst[x] = op::r(src1[x], src2[x], scalar); } } // single source template class OP, typename T1, typename T2, typename Tvec> static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; step1 /= sizeof(T1); step /= sizeof(T1); for (; height--; src1 += step1, dst += step) { int x = 0; for (; x <= width - 4; x += 4) { T1 t0 = op::r(src1[x], scalar); T1 t1 = op::r(src1[x + 1], scalar); dst[x] = t0; dst[x + 1] = t1; t0 = op::r(src1[x + 2], scalar); t1 = op::r(src1[x + 3], scalar); dst[x + 2] = t0; dst[x + 3] = t1; } for (; x < width; ++x) dst[x] = op::r(src1[x], scalar); } } #define SCALAR_LOOP64F scalar_loop_nosimd #else #define SCALAR_LOOP64F scalar_loop #endif // !CV_SIMD_64F #endif // ARITHM_DEFINITIONS_ONLY //========================================================================= // Multiply //========================================================================= #ifdef ARITHM_DEFINITIONS_ONLY ///////////////////////////// Operations ////////////////////////////////// template struct op_mul { static inline Tvec r(const Tvec& a, const Tvec& b) { return a * b; } static inline T1 r(T1 a, T1 b) { return saturate_cast(a * b); } }; template struct op_mul_scale { static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); return v_scalar * a * b; } static inline T1 r(T1 a, T1 b, const T2* scalar) { return c_mul(a, b, *scalar); } static inline Tvec pre(const Tvec&, const Tvec& res) { return res; } }; template<> struct op_mul_scale { #if CV_SIMD_64F static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); return v_scalar * a * b; } #endif static inline double r(double a, double b, const double* scalar) { return c_mul(a, b, *scalar); } static inline v_float64 pre(const v_float64&, const v_float64& res) { return res; } }; //////////////////////////// Loops ///////////////////////////////// template static void mul_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height, const double* scalar) { float fscalar = (float)*scalar; if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON) { bin_loop(src1, step1, src2, step2, dst, step, width, height); } else { scalar_loop(src1, step1, src2, step2, dst, step, width, height, &fscalar); } } template static void mul_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height, const double* scalar) { if (std::fabs(*scalar - 1.0) <= FLT_EPSILON) { bin_loop(src1, step1, src2, step2, dst, step, width, height); } else { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, scalar); } } template<> void mul_loop_d(const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, const double* scalar) { if (*scalar == 1.0) { BIN_LOOP64F(src1, step1, src2, step2, dst, step, width, height); } else { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, scalar); } } #endif // ARITHM_DEFINITIONS_ONLY ////////////////////////////////////////////////////////////////////////// #undef SCALAR_ARGS #define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ _T1* dst, size_t step, int width, int height #undef SCALAR_ARGS_PASS #define SCALAR_ARGS_PASS src1, step1, src2, step2, dst, step, width, height #undef DECLARE_SIMD_FUN #define DECLARE_SIMD_FUN(fun, _T1) void fun(SCALAR_ARGS(_T1), const double* scalar); #undef DISPATCH_SIMD_FUN #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ void fun(SCALAR_ARGS(_T1), void* scalar) \ { \ CV_INSTRUMENT_REGION(); \ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ SCALAR_ARGS_PASS, *(const double*)scalar) \ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ SCALAR_ARGS_PASS, *(const double*)scalar) \ CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ CV_CPU_DISPATCH_MODES_ALL); \ } #undef DEFINE_SIMD_FUN #define DEFINE_SIMD_FUN(fun, _T1, _Tvec, op) \ void fun(SCALAR_ARGS(_T1), const double* scalar) \ { \ CV_INSTRUMENT_REGION(); \ op<_T1, _Tvec>(SCALAR_ARGS_PASS, scalar); \ } #undef DEFINE_NOSIMD_FUN #define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \ DEFINE_SIMD_FUN(fun, _T1, v_float64, _OP) DEFINE_SIMD_SAT(mul, mul_loop) DEFINE_SIMD_F32(mul, mul_loop_d) DEFINE_SIMD_S32(mul, mul_loop_d) DEFINE_SIMD_F64(mul, mul_loop_d) //========================================================================= // Div //========================================================================= #ifdef ARITHM_DEFINITIONS_ONLY ///////////////////////////// Operations ////////////////////////////////// #if CV_VERSION_MAJOR == 3 template struct op_div_f { static inline Tvec r(const Tvec& a, const Tvec& b) { const Tvec v_zero = Tvec(); return v_select(b == v_zero, v_zero, a / b); } static inline T1 r(T1 a, T1 b) { return b != (T1)0 ? a / b : (T1)0; } }; #else template struct op_div_f { static inline Tvec r(const Tvec& a, const Tvec& b) { return a / b; } static inline T1 r(T1 a, T1 b) { return a / b; } }; #endif template struct op_div_scale { static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); return a * v_scalar / b; } static inline Tvec pre(const Tvec& denom, const Tvec& res) { const Tvec v_zero = Tvec(); return v_select(denom == v_zero, v_zero, res); } static inline T1 r(T1 a, T1 denom, const T2* scalar) { return denom != (T1)0 ? c_div(a, denom, *scalar) : (T1)0; } }; template<> struct op_div_scale { #if CV_SIMD_64F static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); return a * v_scalar / b; } static inline v_float64 pre(const v_float64& denom, const v_float64& res) { const v_float64 v_zero = vx_setzero_f64(); return v_select(denom == v_zero, v_zero, res); } #endif static inline double r(double a, double denom, const double* scalar) { return denom != 0.0 ? c_div(a, denom, *scalar) : 0.0; } }; //////////////////////////// Loops ///////////////////////////////// template static void div_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height, const double* scalar) { float fscalar = (float)*scalar; // todo: add new intrinsics for integer divide scalar_loop(src1, step1, src2, step2, dst, step, width, height, &fscalar); } template<> void div_loop(const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, const double* scalar) { float fscalar = (float)*scalar; if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON) { bin_loop(src1, step1, src2, step2, dst, step, width, height); } else { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, &fscalar); } } template<> void div_loop(const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, const double* scalar) { if (*scalar == 1.0) { BIN_LOOP64F(src1, step1, src2, step2, dst, step, width, height); } else { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, scalar); } } #endif // ARITHM_DEFINITIONS_ONLY ////////////////////////////////////////////////////////////////////////// DEFINE_SIMD_ALL(div, div_loop) //========================================================================= // AddWeighted //========================================================================= #ifdef ARITHM_DEFINITIONS_ONLY ///////////////////////////// Operations ////////////////////////////////// ///// Add scale template struct op_add_scale { static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_alpha = vx_setall_f32(*scalar); return v_fma(a, v_alpha, b); } static inline T1 r(T1 a, T1 b, const T2* scalar) { return c_add(a, b, *scalar); } static inline Tvec pre(const Tvec&, const Tvec& res) { return res; } }; template<> struct op_add_scale { #if CV_SIMD_64F static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_alpha = vx_setall_f64(*scalar); return v_fma(a, v_alpha, b); } #endif static inline double r(double a, double b, const double* scalar) { return c_add(a, b, *scalar); } static inline v_float64 pre(const v_float64&, const v_float64& res) { return res; } }; ///// Weighted sum template struct op_add_weighted { static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars) { const v_float32 v_alpha = vx_setall_f32(scalars[0]); const v_float32 v_beta = vx_setall_f32(scalars[1]); const v_float32 v_gamma = vx_setall_f32(scalars[2]); return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); } static inline T1 r(T1 a, T1 b, const T2* scalars) { return c_add(a, b, scalars[0], scalars[1], scalars[2]); } static inline Tvec pre(const Tvec&, const Tvec& res) { return res; } }; template<> struct op_add_weighted { #if CV_SIMD_64F static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars) { const v_float64 v_alpha = vx_setall_f64(scalars[0]); const v_float64 v_beta = vx_setall_f64(scalars[1]); const v_float64 v_gamma = vx_setall_f64(scalars[2]); return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); } #endif static inline double r(double a, double b, const double* scalars) { return c_add(a, b, scalars[0], scalars[1], scalars[2]); } static inline v_float64 pre(const v_float64&, const v_float64& res) { return res; } }; //////////////////////////// Loops ///////////////////////////////// template static void add_weighted_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height, const double* scalars) { float fscalars[] = {(float)scalars[0], (float)scalars[1], (float)scalars[2]}; if (fscalars[1] == 1.0f && fscalars[2] == 0.0f) { scalar_loop(src1, step1, src2, step2, dst, step, width, height, fscalars); } else { scalar_loop(src1, step1, src2, step2, dst, step, width, height, fscalars); } } template static void add_weighted_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height, const double* scalars) { if (scalars[1] == 1.0 && scalars[2] == 0.0) { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, scalars); } else { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, scalars); } } template<> void add_weighted_loop_d(const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, const double* scalars) { if (scalars[1] == 1.0 && scalars[2] == 0.0) { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, scalars); } else { SCALAR_LOOP64F(src1, step1, src2, step2, dst, step, width, height, scalars); } } #endif // ARITHM_DEFINITIONS_ONLY ////////////////////////////////////////////////////////////////////////// #undef DISPATCH_SIMD_FUN #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ void fun(SCALAR_ARGS(_T1), void* scalar) \ { \ CV_INSTRUMENT_REGION(); \ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ SCALAR_ARGS_PASS, (const double*)scalar) \ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ SCALAR_ARGS_PASS, (const double*)scalar) \ CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ CV_CPU_DISPATCH_MODES_ALL); \ } DEFINE_SIMD_SAT(addWeighted, add_weighted_loop) DEFINE_SIMD_S32(addWeighted, add_weighted_loop_d) DEFINE_SIMD_F32(addWeighted, add_weighted_loop_d) DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d) //======================================= // Reciprocal //======================================= #ifdef ARITHM_DEFINITIONS_ONLY ///////////////////////////// Operations ////////////////////////////////// template struct op_recip { static inline v_float32 r(const v_float32& a, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); return v_scalar / a; } static inline Tvec pre(const Tvec& denom, const Tvec& res) { const Tvec v_zero = Tvec(); return v_select(denom == v_zero, v_zero, res); } static inline T1 r(T1 denom, const T2* scalar) { return denom != (T1)0 ? c_div(*scalar, denom) : (T1)0; } }; template<> struct op_recip { #if CV_SIMD_64F static inline v_float64 r(const v_float64& a, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); return v_scalar / a; } static inline v_float64 pre(const v_float64& denom, const v_float64& res) { const v_float64 v_zero = vx_setzero_f64(); return v_select(denom == v_zero, v_zero, res); } #endif static inline double r(double denom, const double* scalar) { return denom != 0.0 ? c_div(*scalar, denom) : 0.0; } }; //////////////////////////// Loops ///////////////////////////////// template static void recip_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const double* scalar) { float fscalar = (float)*scalar; scalar_loop(src1, step1, dst, step, width, height, &fscalar); } template<> void recip_loop(const double* src1, size_t step1, double* dst, size_t step, int width, int height, const double* scalar) { SCALAR_LOOP64F(src1, step1, dst, step, width, height, scalar); } #endif // ARITHM_DEFINITIONS_ONLY ////////////////////////////////////////////////////////////////////////// #undef SCALAR_ARGS #define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, _T1* dst, size_t step, int width, int height #undef SCALAR_ARGS_PASS #define SCALAR_ARGS_PASS src1, step1, dst, step, width, height #undef DISPATCH_SIMD_FUN #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ void fun(const _T1*, size_t, SCALAR_ARGS(_T1), void* scalar) \ { \ CV_INSTRUMENT_REGION(); \ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ SCALAR_ARGS_PASS, *(const double*)scalar) \ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ SCALAR_ARGS_PASS, *(const double*)scalar) \ CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ CV_CPU_DISPATCH_MODES_ALL); \ } DEFINE_SIMD_ALL(recip, recip_loop) #ifndef ARITHM_DISPATCHING_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END #endif #ifndef SIMD_GUARD #define SIMD_GUARD #endif }} // cv::hal::