diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 40979f6c4b..e9e8d28eaa 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -364,6 +364,7 @@ Floating point: |extract_n | x | x | |broadcast_element | x | | |exp | x | x | +|log | x | x | @{ */ @@ -742,13 +743,29 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) #define OPENCV_HAL_MATH_HAVE_EXP 1 +/** + * @brief Natural logarithm \f$ \log(x) \f$ of elements + * + * Only for floating point types. Core implementation steps: + * 1. Decompose Input: Use binary representation to decompose the input into mantissa part \f$ m \f$ and exponent part \f$ e \f$. Such that \f$ \log(x) = \log(m \cdot 2^e) = \log(m) + e \cdot \ln(2) \f$. + * 2. Adjust Mantissa and Exponent Parts: If the mantissa is less than \f$ \sqrt{0.5} \f$, adjust the exponent and mantissa to ensure the mantissa is in the range \f$ (\sqrt{0.5}, \sqrt{2}) \f$ for better approximation. + * 3. Polynomial Approximation for \f$ \log(m) \f$: The closer the \f$ m \f$ is to 1, the more accurate the result. + * - For float16 and float32, use a Taylor Series with 9 terms. + * - For float64, use Pade Polynomials Approximation with 6 terms. + * 4. Combine Results: Add the two parts together to get the final result. + * + * @note The precision of the calculation depends on the implementation and the data type of the input. + * + * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$. + */ +OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) +#define OPENCV_HAL_MATH_HAVE_LOG 1 + //! @cond IGNORED OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) #define OPENCV_HAL_MATH_HAVE_SIN 1 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) #define OPENCV_HAL_MATH_HAVE_COS 1 -OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) -#define OPENCV_HAL_MATH_HAVE_LOG 1 //! @endcond /** @brief Absolute value of elements diff --git a/modules/core/include/opencv2/core/hal/intrin_math.hpp b/modules/core/include/opencv2/core/hal/intrin_math.hpp index 528166889b..0f51b9ba13 100644 --- a/modules/core/include/opencv2/core/hal/intrin_math.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_math.hpp @@ -194,7 +194,229 @@ namespace CV__SIMD_NAMESPACE { #define OPENCV_HAL_MATH_HAVE_EXP 1 //! @} +#endif + +#ifndef OPENCV_HAL_MATH_HAVE_LOG + +//! @name Natural Logarithm +//! @{ +#if defined(CV_SIMD_FP16) && CV_SIMD_FP16 + inline v_float16 v_log(const v_float16 &x) { + const v_float16 _vlog_one_fp16 = vx_setall_f16(1.0f); + const v_float16 _vlog_SQRTHF_fp16 = vx_setall_f16(0.707106781186547524f); + const v_float16 _vlog_q1_fp16 = vx_setall_f16(-2.12194440E-4f); + const v_float16 _vlog_q2_fp16 = vx_setall_f16(0.693359375f); + const v_float16 _vlog_p0_fp16 = vx_setall_f16(7.0376836292E-2f); + const v_float16 _vlog_p1_fp16 = vx_setall_f16(-1.1514610310E-1f); + const v_float16 _vlog_p2_fp16 = vx_setall_f16(1.1676998740E-1f); + const v_float16 _vlog_p3_fp16 = vx_setall_f16(-1.2420140846E-1f); + const v_float16 _vlog_p4_fp16 = vx_setall_f16(1.4249322787E-1f); + const v_float16 _vlog_p5_fp16 = vx_setall_f16(-1.6668057665E-1f); + const v_float16 _vlog_p6_fp16 = vx_setall_f16(2.0000714765E-1f); + const v_float16 _vlog_p7_fp16 = vx_setall_f16(-2.4999993993E-1f); + const v_float16 _vlog_p8_fp16 = vx_setall_f16(3.3333331174E-1f); + const v_int16 _vlog_inv_mant_mask_s16 = vx_setall_s16(~0x7c00); + + v_float16 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; + v_int16 _vlog_ux, _vlog_emm0; + + _vlog_ux = v_reinterpret_as_s16(x); + _vlog_emm0 = v_shr(_vlog_ux, 10); + + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(vx_setall_f16(0.5f))); + _vlog_x = v_reinterpret_as_f16(_vlog_ux); + + _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s16(0xf)); + _vlog_e = v_cvt_f16(_vlog_emm0); + + _vlog_e = v_add(_vlog_e, _vlog_one_fp16); + + v_float16 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp16); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); + + _vlog_z = v_mul(_vlog_x, _vlog_x); + + _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_z); + + _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y); + + _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f16(0.5f))); + + _vlog_x = v_add(_vlog_x, _vlog_y); + _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x); + // log(0) -> -INF + v_float16 mask_zero = v_eq(x, vx_setzero_f16()); + _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(vx_setall_s16(0xfc00)), _vlog_x); + // log(NEG), log(NAN) -> NAN + v_float16 mask_not_nan = v_ge(x, vx_setzero_f16()); + _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(vx_setall_s16(0x7e00))); + // log(INF) -> INF + v_float16 mask_inf = v_eq(x, v_reinterpret_as_f16(vx_setall_s16(0x7c00))); + _vlog_x = v_select(mask_inf, x, _vlog_x); + return _vlog_x; + } +#endif + inline v_float32 v_log(const v_float32 &x) { + const v_float32 _vlog_one_fp32 = vx_setall_f32(1.0f); + const v_float32 _vlog_SQRTHF_fp32 = vx_setall_f32(0.707106781186547524f); + const v_float32 _vlog_q1_fp32 = vx_setall_f32(-2.12194440E-4f); + const v_float32 _vlog_q2_fp32 = vx_setall_f32(0.693359375f); + const v_float32 _vlog_p0_fp32 = vx_setall_f32(7.0376836292E-2f); + const v_float32 _vlog_p1_fp32 = vx_setall_f32(-1.1514610310E-1f); + const v_float32 _vlog_p2_fp32 = vx_setall_f32(1.1676998740E-1f); + const v_float32 _vlog_p3_fp32 = vx_setall_f32(-1.2420140846E-1f); + const v_float32 _vlog_p4_fp32 = vx_setall_f32(1.4249322787E-1f); + const v_float32 _vlog_p5_fp32 = vx_setall_f32(-1.6668057665E-1f); + const v_float32 _vlog_p6_fp32 = vx_setall_f32(2.0000714765E-1f); + const v_float32 _vlog_p7_fp32 = vx_setall_f32(-2.4999993993E-1f); + const v_float32 _vlog_p8_fp32 = vx_setall_f32(3.3333331174E-1f); + const v_int32 _vlog_inv_mant_mask_s32 = vx_setall_s32(~0x7f800000); + + v_float32 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; + v_int32 _vlog_ux, _vlog_emm0; + + _vlog_ux = v_reinterpret_as_s32(x); + _vlog_emm0 = v_shr(_vlog_ux, 23); + + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(vx_setall_f32(0.5f))); + _vlog_x = v_reinterpret_as_f32(_vlog_ux); + + _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s32(0x7f)); + _vlog_e = v_cvt_f32(_vlog_emm0); + + _vlog_e = v_add(_vlog_e, _vlog_one_fp32); + + v_float32 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp32); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); + + _vlog_z = v_mul(_vlog_x, _vlog_x); + + _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_z); + + _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y); + + _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f32(0.5))); + + _vlog_x = v_add(_vlog_x, _vlog_y); + _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x); + // log(0) -> -INF + v_float32 mask_zero = v_eq(x, vx_setzero_f32()); + _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(vx_setall_s32(0xff800000)), _vlog_x); + // log(NEG), log(NAN) -> NAN + v_float32 mask_not_nan = v_ge(x, vx_setzero_f32()); + _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000))); + // log(INF) -> INF + v_float32 mask_inf = v_eq(x, v_reinterpret_as_f32(vx_setall_s32(0x7f800000))); + _vlog_x = v_select(mask_inf, x, _vlog_x); + return _vlog_x; + } + +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F + inline v_float64 v_log(const v_float64 &x) { + const v_float64 _vlog_one_fp64 = vx_setall_f64(1.0); + const v_float64 _vlog_SQRTHF_fp64 = vx_setall_f64(0.7071067811865475244); + const v_float64 _vlog_p0_fp64 = vx_setall_f64(1.01875663804580931796E-4); + const v_float64 _vlog_p1_fp64 = vx_setall_f64(4.97494994976747001425E-1); + const v_float64 _vlog_p2_fp64 = vx_setall_f64(4.70579119878881725854); + const v_float64 _vlog_p3_fp64 = vx_setall_f64(1.44989225341610930846E1); + const v_float64 _vlog_p4_fp64 = vx_setall_f64(1.79368678507819816313E1); + const v_float64 _vlog_p5_fp64 = vx_setall_f64(7.70838733755885391666); + const v_float64 _vlog_q0_fp64 = vx_setall_f64(1.12873587189167450590E1); + const v_float64 _vlog_q1_fp64 = vx_setall_f64(4.52279145837532221105E1); + const v_float64 _vlog_q2_fp64 = vx_setall_f64(8.29875266912776603211E1); + const v_float64 _vlog_q3_fp64 = vx_setall_f64(7.11544750618563894466E1); + const v_float64 _vlog_q4_fp64 = vx_setall_f64(2.31251620126765340583E1); + + const v_float64 _vlog_C0_fp64 = vx_setall_f64(2.121944400546905827679e-4); + const v_float64 _vlog_C1_fp64 = vx_setall_f64(0.693359375); + const v_int64 _vlog_inv_mant_mask_s64 = vx_setall_s64(~0x7ff0000000000000); + + v_float64 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx; + v_int64 _vlog_ux, _vlog_emm0; + + _vlog_ux = v_reinterpret_as_s64(x); + _vlog_emm0 = v_shr(_vlog_ux, 52); + + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(vx_setall_f64(0.5))); + _vlog_x = v_reinterpret_as_f64(_vlog_ux); + + _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s64(0x3ff)); + _vlog_e = v_cvt_f64(_vlog_emm0); + + _vlog_e = v_add(_vlog_e, _vlog_one_fp64); + + v_float64 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp64); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); + + _vlog_xx = v_mul(_vlog_x, _vlog_x); + + _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_xx); + + _vlog_z = v_add(_vlog_x, _vlog_q0_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64); + + _vlog_z = v_div(_vlog_y, _vlog_z); + _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64)); + _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, vx_setall_f64(0.5))); + + _vlog_z = v_add(_vlog_z, _vlog_x); + _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z); + + // log(0) -> -INF + v_float64 mask_zero = v_eq(x, vx_setzero_f64()); + _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(vx_setall_s64(0xfff0000000000000)), _vlog_z); + // log(NEG), log(NAN) -> NAN + v_float64 mask_not_nan = v_ge(x, vx_setzero_f64()); + _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(vx_setall_s64(0x7ff8000000000000))); + // log(INF) -> INF + v_float64 mask_inf = v_eq(x, v_reinterpret_as_f64(vx_setall_s64(0x7ff0000000000000))); + _vlog_z = v_select(mask_inf, x, _vlog_z); + return _vlog_z; + } +#endif + +#define OPENCV_HAL_MATH_HAVE_LOG 1 +//! @} #endif } #endif // OPENCV_HAL_INTRIN_HPP diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 6e55a6ddd3..4893e64ba8 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1792,6 +1792,75 @@ template struct TheTest TheTest &test_exp_fp64() { #if CV_SIMD_64F || CV_SIMD_SCALABLE_64F __test_exp(709.0, 1e-15, 1e15, DBL_MIN); +#endif + return *this; + } + + void __test_log(LaneType expBound, LaneType diff_thr, LaneType flt_min) { + int n = VTraits::vlanes(); + // Test special values + std::vector specialValues = {0, 1, (LaneType) M_E, INFINITY, -INFINITY, NAN}; + const int testRandNum = 10000; + const double specialValueProbability = 0.1; // 10% chance to insert a special value + cv::RNG_MT19937 rng; + + for (int i = 0; i < testRandNum; i++) { + Data dataRand, resRand; + for (int j = 0; j < n; ++j) { + if (rng.uniform(0.f, 1.f) <= specialValueProbability) { + // Insert a special value + int specialValueIndex = rng.uniform(0, (int) specialValues.size()); + dataRand[j] = specialValues[specialValueIndex]; + } else { + // Generate uniform random data in [-expBound, expBound] + dataRand[j] = (LaneType) std::exp(rng.uniform(-expBound, expBound)); + } + } + + // Compare with std::log + R x = dataRand; + resRand = v_log(x); + for (int j = 0; j < n; ++j) { + SCOPED_TRACE(cv::format("Random test value: %f", dataRand[j])); + LaneType std_log = std::log(dataRand[j]); + if (dataRand[j] == 0) { + // input 0 -> output -INF + EXPECT_TRUE(std::isinf(resRand[j]) && resRand[j] < 0); + } else if (dataRand[j] < 0 || std::isnan(dataRand[j])) { + // input less than 0 -> output NAN + // input NaN -> output NaN + EXPECT_TRUE(std::isnan(resRand[j])); + } else if (dataRand[j] == 1) { + // input 1 -> output 0 + EXPECT_EQ((LaneType) 0, resRand[j]); + } else if (std::isinf(dataRand[j]) && dataRand[j] > 0) { + // input INF -> output INF + EXPECT_TRUE(std::isinf(resRand[j]) && resRand[j] > 0); + } else { + EXPECT_LT(std::abs(resRand[j] - std_log), diff_thr * (std::abs(std_log) + flt_min * 100)); + } + } + } + } + + TheTest &test_log_fp16() { +#if CV_SIMD_FP16 + float16_t flt16_min; + uint16_t flt16_min_hex = 0x0400; + std::memcpy(&flt16_min, &flt16_min_hex, sizeof(float16_t)); + __test_log((float16_t) 9, (float16_t) 1e-3, flt16_min); +#endif + return *this; + } + + TheTest &test_log_fp32() { + __test_log(25.f, 1e-6f, FLT_MIN); + return *this; + } + + TheTest &test_log_fp64() { +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F + __test_log(200., 1e-15, DBL_MIN); #endif return *this; } @@ -2109,6 +2178,7 @@ void test_hal_intrin_float32() .test_broadcast_highest() .test_pack_triplets() .test_exp_fp32() + .test_log_fp32() #if CV_SIMD_WIDTH == 32 .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>() .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>() @@ -2140,6 +2210,7 @@ void test_hal_intrin_float64() .test_extract_n<0>().test_extract_n<1>() .test_extract_highest() .test_exp_fp64() + .test_log_fp64() //.test_broadcast_element<0>().test_broadcast_element<1>() #if CV_SIMD_WIDTH == 32 .test_extract<2>().test_extract<3>() @@ -2161,6 +2232,7 @@ void test_hal_intrin_float16() .test_loadstore_fp16() .test_float_cvt_fp16() .test_exp_fp16() + .test_log_fp16() #endif ; #else