Merge pull request #25781 from WanliZhong:v_log

Add support for v_log (Natural Logarithm) #25781 This PR aims to implement `v_log(v_float16 x)`, `v_log(v_float32 x)` and `v_log(v_float64 x)`. Merged after https://github.com/opencv/opencv/pull/24941 TODO: - [x] double and half float precision - [x] tests for them - [x] doc to explain the implementation ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
11 months ago · bef6c110a4
parent 934e6899f8
commit bef6c110a4
3 changed files with 313 additions and 2 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -364,6 +364,7 @@ Floating point:
 |extract_n          | x | x |
 |broadcast_element  | x |   |
 |exp                | x | x |
+|log                | x | x |

 @{ */

@ -742,13 +743,29 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
 #define OPENCV_HAL_MATH_HAVE_EXP 1

+/**
+ * @brief Natural logarithm \f$ \log(x) \f$ of elements
+ *
+ * Only for floating point types. Core implementation steps:
+ * 1. Decompose Input: Use binary representation to decompose the input into mantissa part \f$ m \f$ and exponent part \f$ e \f$. Such that \f$ \log(x) = \log(m \cdot 2^e) = \log(m) + e \cdot \ln(2) \f$.
+ * 2. Adjust Mantissa and Exponent Parts: If the mantissa is less than \f$ \sqrt{0.5} \f$, adjust the exponent and mantissa to ensure the mantissa is in the range \f$ (\sqrt{0.5}, \sqrt{2}) \f$ for better approximation.
+ * 3. Polynomial Approximation for \f$ \log(m) \f$: The closer the \f$ m \f$ is to 1, the more accurate the result.
+ *    - For float16 and float32, use a Taylor Series with 9 terms.
+ *    - For float64, use Pade Polynomials Approximation with 6 terms.
+ * 4. Combine Results: Add the two parts together to get the final result.
+ *
+ * @note The precision of the calculation depends on the implementation and the data type of the input.
+ *
+ * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
+ */
+OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
+#define OPENCV_HAL_MATH_HAVE_LOG 1
+
 //! @cond IGNORED
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
 #define OPENCV_HAL_MATH_HAVE_SIN 1
 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
 #define OPENCV_HAL_MATH_HAVE_COS 1
-OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
-#define OPENCV_HAL_MATH_HAVE_LOG 1
 //! @endcond

 /** @brief Absolute value of elements
--- a/modules/core/include/opencv2/core/hal/intrin_math.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_math.hpp
@ -194,7 +194,229 @@ namespace CV__SIMD_NAMESPACE {

 #define OPENCV_HAL_MATH_HAVE_EXP 1
 //! @}
+#endif
+
+#ifndef OPENCV_HAL_MATH_HAVE_LOG
+
+//! @name Natural Logarithm
+//! @{
+#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
+    inline v_float16 v_log(const v_float16 &x) {
+        const v_float16 _vlog_one_fp16 = vx_setall_f16(1.0f);
+        const v_float16 _vlog_SQRTHF_fp16 = vx_setall_f16(0.707106781186547524f);
+        const v_float16 _vlog_q1_fp16 = vx_setall_f16(-2.12194440E-4f);
+        const v_float16 _vlog_q2_fp16 = vx_setall_f16(0.693359375f);
+        const v_float16 _vlog_p0_fp16 = vx_setall_f16(7.0376836292E-2f);
+        const v_float16 _vlog_p1_fp16 = vx_setall_f16(-1.1514610310E-1f);
+        const v_float16 _vlog_p2_fp16 = vx_setall_f16(1.1676998740E-1f);
+        const v_float16 _vlog_p3_fp16 = vx_setall_f16(-1.2420140846E-1f);
+        const v_float16 _vlog_p4_fp16 = vx_setall_f16(1.4249322787E-1f);
+        const v_float16 _vlog_p5_fp16 = vx_setall_f16(-1.6668057665E-1f);
+        const v_float16 _vlog_p6_fp16 = vx_setall_f16(2.0000714765E-1f);
+        const v_float16 _vlog_p7_fp16 = vx_setall_f16(-2.4999993993E-1f);
+        const v_float16 _vlog_p8_fp16 = vx_setall_f16(3.3333331174E-1f);
+        const v_int16 _vlog_inv_mant_mask_s16 = vx_setall_s16(~0x7c00);
+
+        v_float16 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
+        v_int16 _vlog_ux, _vlog_emm0;
+
+        _vlog_ux = v_reinterpret_as_s16(x);
+        _vlog_emm0 = v_shr(_vlog_ux, 10);
+
+        _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
+        _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(vx_setall_f16(0.5f)));
+        _vlog_x = v_reinterpret_as_f16(_vlog_ux);
+
+        _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s16(0xf));
+        _vlog_e = v_cvt_f16(_vlog_emm0);
+
+        _vlog_e = v_add(_vlog_e, _vlog_one_fp16);
+
+        v_float16 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
+        _vlog_tmp = v_and(_vlog_x, _vlog_mask);
+        _vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
+        _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
+        _vlog_x = v_add(_vlog_x, _vlog_tmp);
+
+        _vlog_z = v_mul(_vlog_x, _vlog_x);
+
+        _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
+        _vlog_y = v_mul(_vlog_y, _vlog_x);
+        _vlog_y = v_mul(_vlog_y, _vlog_z);
+
+        _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
+
+        _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f16(0.5f)));
+
+        _vlog_x = v_add(_vlog_x, _vlog_y);
+        _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
+        // log(0) -> -INF
+        v_float16 mask_zero = v_eq(x, vx_setzero_f16());
+        _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(vx_setall_s16(0xfc00)), _vlog_x);
+        // log(NEG), log(NAN) -> NAN
+        v_float16 mask_not_nan = v_ge(x, vx_setzero_f16());
+        _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
+        // log(INF) -> INF
+        v_float16 mask_inf = v_eq(x, v_reinterpret_as_f16(vx_setall_s16(0x7c00)));
+        _vlog_x = v_select(mask_inf, x, _vlog_x);
+        return _vlog_x;
+    }
+#endif

+    inline v_float32 v_log(const v_float32 &x) {
+        const v_float32 _vlog_one_fp32 = vx_setall_f32(1.0f);
+        const v_float32 _vlog_SQRTHF_fp32 = vx_setall_f32(0.707106781186547524f);
+        const v_float32 _vlog_q1_fp32 = vx_setall_f32(-2.12194440E-4f);
+        const v_float32 _vlog_q2_fp32 = vx_setall_f32(0.693359375f);
+        const v_float32 _vlog_p0_fp32 = vx_setall_f32(7.0376836292E-2f);
+        const v_float32 _vlog_p1_fp32 = vx_setall_f32(-1.1514610310E-1f);
+        const v_float32 _vlog_p2_fp32 = vx_setall_f32(1.1676998740E-1f);
+        const v_float32 _vlog_p3_fp32 = vx_setall_f32(-1.2420140846E-1f);
+        const v_float32 _vlog_p4_fp32 = vx_setall_f32(1.4249322787E-1f);
+        const v_float32 _vlog_p5_fp32 = vx_setall_f32(-1.6668057665E-1f);
+        const v_float32 _vlog_p6_fp32 = vx_setall_f32(2.0000714765E-1f);
+        const v_float32 _vlog_p7_fp32 = vx_setall_f32(-2.4999993993E-1f);
+        const v_float32 _vlog_p8_fp32 = vx_setall_f32(3.3333331174E-1f);
+        const v_int32 _vlog_inv_mant_mask_s32 = vx_setall_s32(~0x7f800000);
+
+        v_float32 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
+        v_int32 _vlog_ux, _vlog_emm0;
+
+        _vlog_ux = v_reinterpret_as_s32(x);
+        _vlog_emm0 = v_shr(_vlog_ux, 23);
+
+        _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
+        _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(vx_setall_f32(0.5f)));
+        _vlog_x = v_reinterpret_as_f32(_vlog_ux);
+
+        _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s32(0x7f));
+        _vlog_e = v_cvt_f32(_vlog_emm0);
+
+        _vlog_e = v_add(_vlog_e, _vlog_one_fp32);
+
+        v_float32 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
+        _vlog_tmp = v_and(_vlog_x, _vlog_mask);
+        _vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
+        _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
+        _vlog_x = v_add(_vlog_x, _vlog_tmp);
+
+        _vlog_z = v_mul(_vlog_x, _vlog_x);
+
+        _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
+        _vlog_y = v_mul(_vlog_y, _vlog_x);
+        _vlog_y = v_mul(_vlog_y, _vlog_z);
+
+        _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);
+
+        _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f32(0.5)));
+
+        _vlog_x = v_add(_vlog_x, _vlog_y);
+        _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
+        // log(0) -> -INF
+        v_float32 mask_zero = v_eq(x, vx_setzero_f32());
+        _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(vx_setall_s32(0xff800000)), _vlog_x);
+        // log(NEG), log(NAN) -> NAN
+        v_float32 mask_not_nan = v_ge(x, vx_setzero_f32());
+        _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
+        // log(INF) -> INF
+        v_float32 mask_inf = v_eq(x, v_reinterpret_as_f32(vx_setall_s32(0x7f800000)));
+        _vlog_x = v_select(mask_inf, x, _vlog_x);
+        return _vlog_x;
+    }
+
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    inline v_float64 v_log(const v_float64 &x) {
+        const v_float64 _vlog_one_fp64 = vx_setall_f64(1.0);
+        const v_float64 _vlog_SQRTHF_fp64 = vx_setall_f64(0.7071067811865475244);
+        const v_float64 _vlog_p0_fp64 = vx_setall_f64(1.01875663804580931796E-4);
+        const v_float64 _vlog_p1_fp64 = vx_setall_f64(4.97494994976747001425E-1);
+        const v_float64 _vlog_p2_fp64 = vx_setall_f64(4.70579119878881725854);
+        const v_float64 _vlog_p3_fp64 = vx_setall_f64(1.44989225341610930846E1);
+        const v_float64 _vlog_p4_fp64 = vx_setall_f64(1.79368678507819816313E1);
+        const v_float64 _vlog_p5_fp64 = vx_setall_f64(7.70838733755885391666);
+        const v_float64 _vlog_q0_fp64 = vx_setall_f64(1.12873587189167450590E1);
+        const v_float64 _vlog_q1_fp64 = vx_setall_f64(4.52279145837532221105E1);
+        const v_float64 _vlog_q2_fp64 = vx_setall_f64(8.29875266912776603211E1);
+        const v_float64 _vlog_q3_fp64 = vx_setall_f64(7.11544750618563894466E1);
+        const v_float64 _vlog_q4_fp64 = vx_setall_f64(2.31251620126765340583E1);
+
+        const v_float64 _vlog_C0_fp64 = vx_setall_f64(2.121944400546905827679e-4);
+        const v_float64 _vlog_C1_fp64 = vx_setall_f64(0.693359375);
+        const v_int64 _vlog_inv_mant_mask_s64 = vx_setall_s64(~0x7ff0000000000000);
+
+        v_float64 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
+        v_int64 _vlog_ux, _vlog_emm0;
+
+        _vlog_ux = v_reinterpret_as_s64(x);
+        _vlog_emm0 = v_shr(_vlog_ux, 52);
+
+        _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
+        _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(vx_setall_f64(0.5)));
+        _vlog_x = v_reinterpret_as_f64(_vlog_ux);
+
+        _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s64(0x3ff));
+        _vlog_e = v_cvt_f64(_vlog_emm0);
+
+        _vlog_e = v_add(_vlog_e, _vlog_one_fp64);
+
+        v_float64 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
+        _vlog_tmp = v_and(_vlog_x, _vlog_mask);
+        _vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
+        _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
+        _vlog_x = v_add(_vlog_x, _vlog_tmp);
+
+        _vlog_xx = v_mul(_vlog_x, _vlog_x);
+
+        _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
+        _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
+        _vlog_y = v_mul(_vlog_y, _vlog_x);
+        _vlog_y = v_mul(_vlog_y, _vlog_xx);
+
+        _vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
+        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
+        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
+        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
+        _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
+
+        _vlog_z = v_div(_vlog_y, _vlog_z);
+        _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
+        _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, vx_setall_f64(0.5)));
+
+        _vlog_z = v_add(_vlog_z, _vlog_x);
+        _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);
+
+        // log(0) -> -INF
+        v_float64 mask_zero = v_eq(x, vx_setzero_f64());
+        _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(vx_setall_s64(0xfff0000000000000)), _vlog_z);
+        // log(NEG), log(NAN) -> NAN
+        v_float64 mask_not_nan = v_ge(x, vx_setzero_f64());
+        _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(vx_setall_s64(0x7ff8000000000000)));
+        // log(INF) -> INF
+        v_float64 mask_inf = v_eq(x, v_reinterpret_as_f64(vx_setall_s64(0x7ff0000000000000)));
+        _vlog_z = v_select(mask_inf, x, _vlog_z);
+        return _vlog_z;
+    }
+#endif
+
+#define OPENCV_HAL_MATH_HAVE_LOG 1
+//! @}
 #endif
 }
 #endif  // OPENCV_HAL_INTRIN_HPP
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -1792,6 +1792,75 @@ template<typename R> struct TheTest
    TheTest &test_exp_fp64() {
 #if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
        __test_exp(709.0, 1e-15, 1e15, DBL_MIN);
+#endif
+        return *this;
+    }
+
+    void __test_log(LaneType expBound, LaneType diff_thr, LaneType flt_min) {
+        int n = VTraits<R>::vlanes();
+        // Test special values
+        std::vector<LaneType> specialValues = {0, 1, (LaneType) M_E, INFINITY, -INFINITY, NAN};
+        const int testRandNum = 10000;
+        const double specialValueProbability = 0.1; // 10% chance to insert a special value
+        cv::RNG_MT19937 rng;
+
+        for (int i = 0; i < testRandNum; i++) {
+            Data<R> dataRand, resRand;
+            for (int j = 0; j < n; ++j) {
+                if (rng.uniform(0.f, 1.f) <= specialValueProbability) {
+                    // Insert a special value
+                    int specialValueIndex = rng.uniform(0, (int) specialValues.size());
+                    dataRand[j] = specialValues[specialValueIndex];
+                } else {
+                    // Generate uniform random data in [-expBound, expBound]
+                    dataRand[j] = (LaneType) std::exp(rng.uniform(-expBound, expBound));
+                }
+            }
+
+            // Compare with std::log
+            R x = dataRand;
+            resRand = v_log(x);
+            for (int j = 0; j < n; ++j) {
+                SCOPED_TRACE(cv::format("Random test value: %f", dataRand[j]));
+                LaneType std_log = std::log(dataRand[j]);
+                if (dataRand[j] == 0) {
+                    // input 0 -> output -INF
+                    EXPECT_TRUE(std::isinf(resRand[j]) && resRand[j] < 0);
+                } else if (dataRand[j] < 0 || std::isnan(dataRand[j])) {
+                    // input less than 0 -> output NAN
+                    // input NaN -> output NaN
+                    EXPECT_TRUE(std::isnan(resRand[j]));
+                } else if (dataRand[j] == 1) {
+                    // input 1 -> output 0
+                    EXPECT_EQ((LaneType) 0, resRand[j]);
+                } else if (std::isinf(dataRand[j]) && dataRand[j] > 0) {
+                    // input INF -> output INF
+                    EXPECT_TRUE(std::isinf(resRand[j]) && resRand[j] > 0);
+                } else {
+                    EXPECT_LT(std::abs(resRand[j] - std_log), diff_thr * (std::abs(std_log) + flt_min * 100));
+                }
+            }
+        }
+    }
+
+    TheTest &test_log_fp16() {
+#if CV_SIMD_FP16
+        float16_t flt16_min;
+        uint16_t flt16_min_hex = 0x0400;
+        std::memcpy(&flt16_min, &flt16_min_hex, sizeof(float16_t));
+        __test_log((float16_t) 9, (float16_t) 1e-3, flt16_min);
+#endif
+        return *this;
+    }
+
+    TheTest &test_log_fp32() {
+        __test_log(25.f, 1e-6f, FLT_MIN);
+        return *this;
+    }
+
+    TheTest &test_log_fp64() {
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+        __test_log(200., 1e-15, DBL_MIN);
 #endif
        return *this;
    }
@ -2109,6 +2178,7 @@ void test_hal_intrin_float32()
        .test_broadcast_highest()
        .test_pack_triplets()
        .test_exp_fp32()
+        .test_log_fp32()
 #if CV_SIMD_WIDTH == 32
        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
@ -2140,6 +2210,7 @@ void test_hal_intrin_float64()
        .test_extract_n<0>().test_extract_n<1>()
        .test_extract_highest()
        .test_exp_fp64()
+        .test_log_fp64()
        //.test_broadcast_element<0>().test_broadcast_element<1>()
 #if CV_SIMD_WIDTH == 32
        .test_extract<2>().test_extract<3>()
@ -2161,6 +2232,7 @@ void test_hal_intrin_float16()
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
        .test_exp_fp16()
+        .test_log_fp16()
 #endif
        ;
 #else