Merge pull request #25781 from WanliZhong:v_log

Add support for v_log (Natural Logarithm) #25781

This PR aims to implement `v_log(v_float16 x)`, `v_log(v_float32 x)` and `v_log(v_float64 x)`. 
Merged after https://github.com/opencv/opencv/pull/24941

TODO:
- [x] double and half float precision
- [x] tests for them
- [x] doc to explain the implementation

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
pull/25857/head
Wanli 4 months ago committed by GitHub
parent 934e6899f8
commit bef6c110a4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 21
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  2. 222
      modules/core/include/opencv2/core/hal/intrin_math.hpp
  3. 72
      modules/core/test/test_intrin_utils.hpp

@ -364,6 +364,7 @@ Floating point:
|extract_n | x | x |
|broadcast_element | x | |
|exp | x | x |
|log | x | x |
@{ */
@ -742,13 +743,29 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
#define OPENCV_HAL_MATH_HAVE_EXP 1
/**
* @brief Natural logarithm \f$ \log(x) \f$ of elements
*
* Only for floating point types. Core implementation steps:
* 1. Decompose Input: Use binary representation to decompose the input into mantissa part \f$ m \f$ and exponent part \f$ e \f$. Such that \f$ \log(x) = \log(m \cdot 2^e) = \log(m) + e \cdot \ln(2) \f$.
* 2. Adjust Mantissa and Exponent Parts: If the mantissa is less than \f$ \sqrt{0.5} \f$, adjust the exponent and mantissa to ensure the mantissa is in the range \f$ (\sqrt{0.5}, \sqrt{2}) \f$ for better approximation.
* 3. Polynomial Approximation for \f$ \log(m) \f$: The closer the \f$ m \f$ is to 1, the more accurate the result.
* - For float16 and float32, use a Taylor Series with 9 terms.
* - For float64, use Pade Polynomials Approximation with 6 terms.
* 4. Combine Results: Add the two parts together to get the final result.
*
* @note The precision of the calculation depends on the implementation and the data type of the input.
*
* @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
#define OPENCV_HAL_MATH_HAVE_LOG 1
//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
#define OPENCV_HAL_MATH_HAVE_SIN 1
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
#define OPENCV_HAL_MATH_HAVE_COS 1
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
#define OPENCV_HAL_MATH_HAVE_LOG 1
//! @endcond
/** @brief Absolute value of elements

@ -194,7 +194,229 @@ namespace CV__SIMD_NAMESPACE {
#define OPENCV_HAL_MATH_HAVE_EXP 1
//! @}
#endif
#ifndef OPENCV_HAL_MATH_HAVE_LOG
//! @name Natural Logarithm
//! @{
#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
inline v_float16 v_log(const v_float16 &x) {
const v_float16 _vlog_one_fp16 = vx_setall_f16(1.0f);
const v_float16 _vlog_SQRTHF_fp16 = vx_setall_f16(0.707106781186547524f);
const v_float16 _vlog_q1_fp16 = vx_setall_f16(-2.12194440E-4f);
const v_float16 _vlog_q2_fp16 = vx_setall_f16(0.693359375f);
const v_float16 _vlog_p0_fp16 = vx_setall_f16(7.0376836292E-2f);
const v_float16 _vlog_p1_fp16 = vx_setall_f16(-1.1514610310E-1f);
const v_float16 _vlog_p2_fp16 = vx_setall_f16(1.1676998740E-1f);
const v_float16 _vlog_p3_fp16 = vx_setall_f16(-1.2420140846E-1f);
const v_float16 _vlog_p4_fp16 = vx_setall_f16(1.4249322787E-1f);
const v_float16 _vlog_p5_fp16 = vx_setall_f16(-1.6668057665E-1f);
const v_float16 _vlog_p6_fp16 = vx_setall_f16(2.0000714765E-1f);
const v_float16 _vlog_p7_fp16 = vx_setall_f16(-2.4999993993E-1f);
const v_float16 _vlog_p8_fp16 = vx_setall_f16(3.3333331174E-1f);
const v_int16 _vlog_inv_mant_mask_s16 = vx_setall_s16(~0x7c00);
v_float16 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
v_int16 _vlog_ux, _vlog_emm0;
_vlog_ux = v_reinterpret_as_s16(x);
_vlog_emm0 = v_shr(_vlog_ux, 10);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(vx_setall_f16(0.5f)));
_vlog_x = v_reinterpret_as_f16(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s16(0xf));
_vlog_e = v_cvt_f16(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp16);
v_float16 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f16(0.5f)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
// log(0) -> -INF
v_float16 mask_zero = v_eq(x, vx_setzero_f16());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f16(vx_setall_s16(0xfc00)), _vlog_x);
// log(NEG), log(NAN) -> NAN
v_float16 mask_not_nan = v_ge(x, vx_setzero_f16());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
// log(INF) -> INF
v_float16 mask_inf = v_eq(x, v_reinterpret_as_f16(vx_setall_s16(0x7c00)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
#endif
inline v_float32 v_log(const v_float32 &x) {
const v_float32 _vlog_one_fp32 = vx_setall_f32(1.0f);
const v_float32 _vlog_SQRTHF_fp32 = vx_setall_f32(0.707106781186547524f);
const v_float32 _vlog_q1_fp32 = vx_setall_f32(-2.12194440E-4f);
const v_float32 _vlog_q2_fp32 = vx_setall_f32(0.693359375f);
const v_float32 _vlog_p0_fp32 = vx_setall_f32(7.0376836292E-2f);
const v_float32 _vlog_p1_fp32 = vx_setall_f32(-1.1514610310E-1f);
const v_float32 _vlog_p2_fp32 = vx_setall_f32(1.1676998740E-1f);
const v_float32 _vlog_p3_fp32 = vx_setall_f32(-1.2420140846E-1f);
const v_float32 _vlog_p4_fp32 = vx_setall_f32(1.4249322787E-1f);
const v_float32 _vlog_p5_fp32 = vx_setall_f32(-1.6668057665E-1f);
const v_float32 _vlog_p6_fp32 = vx_setall_f32(2.0000714765E-1f);
const v_float32 _vlog_p7_fp32 = vx_setall_f32(-2.4999993993E-1f);
const v_float32 _vlog_p8_fp32 = vx_setall_f32(3.3333331174E-1f);
const v_int32 _vlog_inv_mant_mask_s32 = vx_setall_s32(~0x7f800000);
v_float32 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
v_int32 _vlog_ux, _vlog_emm0;
_vlog_ux = v_reinterpret_as_s32(x);
_vlog_emm0 = v_shr(_vlog_ux, 23);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(vx_setall_f32(0.5f)));
_vlog_x = v_reinterpret_as_f32(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s32(0x7f));
_vlog_e = v_cvt_f32(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp32);
v_float32 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f32(0.5)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
// log(0) -> -INF
v_float32 mask_zero = v_eq(x, vx_setzero_f32());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f32(vx_setall_s32(0xff800000)), _vlog_x);
// log(NEG), log(NAN) -> NAN
v_float32 mask_not_nan = v_ge(x, vx_setzero_f32());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
// log(INF) -> INF
v_float32 mask_inf = v_eq(x, v_reinterpret_as_f32(vx_setall_s32(0x7f800000)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 v_log(const v_float64 &x) {
const v_float64 _vlog_one_fp64 = vx_setall_f64(1.0);
const v_float64 _vlog_SQRTHF_fp64 = vx_setall_f64(0.7071067811865475244);
const v_float64 _vlog_p0_fp64 = vx_setall_f64(1.01875663804580931796E-4);
const v_float64 _vlog_p1_fp64 = vx_setall_f64(4.97494994976747001425E-1);
const v_float64 _vlog_p2_fp64 = vx_setall_f64(4.70579119878881725854);
const v_float64 _vlog_p3_fp64 = vx_setall_f64(1.44989225341610930846E1);
const v_float64 _vlog_p4_fp64 = vx_setall_f64(1.79368678507819816313E1);
const v_float64 _vlog_p5_fp64 = vx_setall_f64(7.70838733755885391666);
const v_float64 _vlog_q0_fp64 = vx_setall_f64(1.12873587189167450590E1);
const v_float64 _vlog_q1_fp64 = vx_setall_f64(4.52279145837532221105E1);
const v_float64 _vlog_q2_fp64 = vx_setall_f64(8.29875266912776603211E1);
const v_float64 _vlog_q3_fp64 = vx_setall_f64(7.11544750618563894466E1);
const v_float64 _vlog_q4_fp64 = vx_setall_f64(2.31251620126765340583E1);
const v_float64 _vlog_C0_fp64 = vx_setall_f64(2.121944400546905827679e-4);
const v_float64 _vlog_C1_fp64 = vx_setall_f64(0.693359375);
const v_int64 _vlog_inv_mant_mask_s64 = vx_setall_s64(~0x7ff0000000000000);
v_float64 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
v_int64 _vlog_ux, _vlog_emm0;
_vlog_ux = v_reinterpret_as_s64(x);
_vlog_emm0 = v_shr(_vlog_ux, 52);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(vx_setall_f64(0.5)));
_vlog_x = v_reinterpret_as_f64(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s64(0x3ff));
_vlog_e = v_cvt_f64(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp64);
v_float64 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_xx = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_xx);
_vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
_vlog_z = v_div(_vlog_y, _vlog_z);
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, vx_setall_f64(0.5)));
_vlog_z = v_add(_vlog_z, _vlog_x);
_vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);
// log(0) -> -INF
v_float64 mask_zero = v_eq(x, vx_setzero_f64());
_vlog_z = v_select(mask_zero, v_reinterpret_as_f64(vx_setall_s64(0xfff0000000000000)), _vlog_z);
// log(NEG), log(NAN) -> NAN
v_float64 mask_not_nan = v_ge(x, vx_setzero_f64());
_vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(vx_setall_s64(0x7ff8000000000000)));
// log(INF) -> INF
v_float64 mask_inf = v_eq(x, v_reinterpret_as_f64(vx_setall_s64(0x7ff0000000000000)));
_vlog_z = v_select(mask_inf, x, _vlog_z);
return _vlog_z;
}
#endif
#define OPENCV_HAL_MATH_HAVE_LOG 1
//! @}
#endif
}
#endif // OPENCV_HAL_INTRIN_HPP

@ -1792,6 +1792,75 @@ template<typename R> struct TheTest
TheTest &test_exp_fp64() {
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
__test_exp(709.0, 1e-15, 1e15, DBL_MIN);
#endif
return *this;
}
void __test_log(LaneType expBound, LaneType diff_thr, LaneType flt_min) {
int n = VTraits<R>::vlanes();
// Test special values
std::vector<LaneType> specialValues = {0, 1, (LaneType) M_E, INFINITY, -INFINITY, NAN};
const int testRandNum = 10000;
const double specialValueProbability = 0.1; // 10% chance to insert a special value
cv::RNG_MT19937 rng;
for (int i = 0; i < testRandNum; i++) {
Data<R> dataRand, resRand;
for (int j = 0; j < n; ++j) {
if (rng.uniform(0.f, 1.f) <= specialValueProbability) {
// Insert a special value
int specialValueIndex = rng.uniform(0, (int) specialValues.size());
dataRand[j] = specialValues[specialValueIndex];
} else {
// Generate uniform random data in [-expBound, expBound]
dataRand[j] = (LaneType) std::exp(rng.uniform(-expBound, expBound));
}
}
// Compare with std::log
R x = dataRand;
resRand = v_log(x);
for (int j = 0; j < n; ++j) {
SCOPED_TRACE(cv::format("Random test value: %f", dataRand[j]));
LaneType std_log = std::log(dataRand[j]);
if (dataRand[j] == 0) {
// input 0 -> output -INF
EXPECT_TRUE(std::isinf(resRand[j]) && resRand[j] < 0);
} else if (dataRand[j] < 0 || std::isnan(dataRand[j])) {
// input less than 0 -> output NAN
// input NaN -> output NaN
EXPECT_TRUE(std::isnan(resRand[j]));
} else if (dataRand[j] == 1) {
// input 1 -> output 0
EXPECT_EQ((LaneType) 0, resRand[j]);
} else if (std::isinf(dataRand[j]) && dataRand[j] > 0) {
// input INF -> output INF
EXPECT_TRUE(std::isinf(resRand[j]) && resRand[j] > 0);
} else {
EXPECT_LT(std::abs(resRand[j] - std_log), diff_thr * (std::abs(std_log) + flt_min * 100));
}
}
}
}
TheTest &test_log_fp16() {
#if CV_SIMD_FP16
float16_t flt16_min;
uint16_t flt16_min_hex = 0x0400;
std::memcpy(&flt16_min, &flt16_min_hex, sizeof(float16_t));
__test_log((float16_t) 9, (float16_t) 1e-3, flt16_min);
#endif
return *this;
}
TheTest &test_log_fp32() {
__test_log(25.f, 1e-6f, FLT_MIN);
return *this;
}
TheTest &test_log_fp64() {
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
__test_log(200., 1e-15, DBL_MIN);
#endif
return *this;
}
@ -2109,6 +2178,7 @@ void test_hal_intrin_float32()
.test_broadcast_highest()
.test_pack_triplets()
.test_exp_fp32()
.test_log_fp32()
#if CV_SIMD_WIDTH == 32
.test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
.test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
@ -2140,6 +2210,7 @@ void test_hal_intrin_float64()
.test_extract_n<0>().test_extract_n<1>()
.test_extract_highest()
.test_exp_fp64()
.test_log_fp64()
//.test_broadcast_element<0>().test_broadcast_element<1>()
#if CV_SIMD_WIDTH == 32
.test_extract<2>().test_extract<3>()
@ -2161,6 +2232,7 @@ void test_hal_intrin_float16()
.test_loadstore_fp16()
.test_float_cvt_fp16()
.test_exp_fp16()
.test_log_fp16()
#endif
;
#else

Loading…
Cancel
Save