diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index e9e8d28eaa..e364ba359b 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -263,7 +263,8 @@ Most of these operations return only one value.
 
 ### Other math
 
-- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp,
+  @ref v_erf
 - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
 
 ### Conversions
@@ -761,6 +762,14 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
 #define OPENCV_HAL_MATH_HAVE_LOG 1
 
+/**
+ * @brief Error function.
+ *
+ * @note Only FP32 precision is supported for now.
+ */
+OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
+#define OPENCV_HAL_MATH_HAVE_ERF 1
+
 //! @cond IGNORED
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
 #define OPENCV_HAL_MATH_HAVE_SIN 1
diff --git a/modules/core/include/opencv2/core/hal/intrin_math.hpp b/modules/core/include/opencv2/core/hal/intrin_math.hpp
index 0f51b9ba13..4f967cff1a 100644
--- a/modules/core/include/opencv2/core/hal/intrin_math.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_math.hpp
@@ -418,5 +418,64 @@ namespace CV__SIMD_NAMESPACE {
 #define OPENCV_HAL_MATH_HAVE_LOG 1
 //! @}
 #endif
+
+/* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch
+   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
+*/
+
+#ifndef OPENCV_HAL_MATH_HAVE_ERF
+
+//! @name Error Function
+//! @{
+
+    /**
+     * @brief Approximates the error function of every FP32 lane.
+     *
+     * Computes 1 - t * P(t) * exp(-v * v) with t = 1 / (1 + 0.3275911 * |v|), where P is a
+     * degree-4 polynomial evaluated by Horner's scheme, and then puts the sign of the input
+     * back on the result. The constants match Abramowitz & Stegun, formula 7.1.26.
+     * Non-finite lanes are not special-cased; they simply flow through the arithmetic.
+     *
+     * @param v input vector
+     * @return vector with the approximated erf of each lane of @p v
+     */
+    inline v_float32 v_erf(const v_float32 &v) {
+        const v_float32 p = vx_setall_f32(0.3275911f);
+        const v_float32 a5 = vx_setall_f32(1.061405429f);
+        const v_float32 a4 = vx_setall_f32(-1.453152027f);
+        const v_float32 a3 = vx_setall_f32(1.421413741f);
+        const v_float32 a2 = vx_setall_f32(-0.284496736f);
+        const v_float32 a1 = vx_setall_f32(0.254829592f);
+        const v_float32 one = vx_setall_f32(1.0f);
+        // -0.f has only the sign bit set: v_and with it extracts a sign, v_xor with it negates.
+        const v_float32 sign_bit = vx_setall_f32(-0.f);
+
+        // Keep sign(v) aside; the approximation itself is evaluated on |v|.
+        const v_float32 sign_mask = v_and(sign_bit, v);
+
+        // t = 1 / (1 + p * |v|)
+        const v_float32 t = v_div(one, v_fma(p, v_abs(v), one));
+
+        // poly = (((a5 * t + a4) * t + a3) * t + a2) * t + a1
+        v_float32 poly = v_fma(a5, t, a4);
+        poly = v_fma(poly, t, a3);
+        poly = v_fma(poly, t, a2);
+        poly = v_fma(poly, t, a1);
+
+        // -exp(-v * v)
+        const v_float32 neg_sq = v_xor(sign_bit, v_mul(v, v));
+        const v_float32 neg_exp = v_xor(sign_bit, v_exp(neg_sq));
+
+        // 1 - t * poly * exp(-v * v), then restore the sign of the input.
+        const v_float32 magnitude = v_fma(poly, v_mul(t, neg_exp), one);
+        return v_xor(sign_mask, magnitude);
+    }
+
+#define OPENCV_HAL_MATH_HAVE_ERF 1
+//! @}
+
+#endif // OPENCV_HAL_MATH_HAVE_ERF
+
+
 }
 #endif // OPENCV_HAL_INTRIN_HPP
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 4893e64ba8..742136f84c 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -1864,6 +1864,50 @@ template struct TheTest
 #endif
         return *this;
     }
+
+    /** Compares v_erf() with std::erf() on random FP32 lanes; each lane is replaced by
+     *  +/-INFINITY or NAN with probability insert_singular_input_probability. */
+    TheTest &test_erf_fp32() {
+        int n = VTraits<R>::vlanes();
+
+        constexpr int num_loops = 10000;
+        const std::vector<LaneType> singular_inputs{INFINITY, -INFINITY, NAN};
+        constexpr double insert_singular_input_probability = 0.1;
+        cv::RNG_MT19937 rng;
+
+        for (int i = 0; i < num_loops; i++) {
+            Data<R> inputs;
+            for (int j = 0; j < n; j++) {
+                if (rng.uniform(0.f, 1.f) <= insert_singular_input_probability) {
+                    int singular_input_index = rng.uniform(0, int(singular_inputs.size()));
+                    inputs[j] = singular_inputs[singular_input_index];
+                } else {
+                    // std::exp(float) overflows at about 88.0f.
+                    // In v_erf, exp is called on input*input. So test range is [-sqrt(88.0f), sqrt(88.0f)]
+                    inputs[j] = (LaneType) rng.uniform(-9.4f, 9.4f);
+                }
+            }
+
+            Data<R> outputs = v_erf(R(inputs));
+            for (int j = 0; j < n; j++) {
+                SCOPED_TRACE(cv::format("Random test value: %f", inputs[j]));
+                if (std::isinf(inputs[j])) {
+                    if (inputs[j] < 0) {
+                        EXPECT_EQ(-1, outputs[j]);
+                    } else {
+                        EXPECT_EQ(1, outputs[j]);
+                    }
+                } else if (std::isnan(inputs[j])) {
+                    EXPECT_TRUE(std::isnan(outputs[j]));
+                } else {
+                    LaneType ref_output = std::erf(inputs[j]);
+                    EXPECT_LT(std::abs(outputs[j] - ref_output), 1e-3f * (std::abs(ref_output) + FLT_MIN * 1e4f));
+                }
+            }
+        }
+
+        return *this;
+    }
 };
 
 #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits::vlanes(), CV__TRACE_FUNCTION);
@@ -2179,6 +2223,7 @@ void test_hal_intrin_float32()
         .test_pack_triplets()
         .test_exp_fp32()
         .test_log_fp32()
+        .test_erf_fp32()
 #if CV_SIMD_WIDTH == 32
         .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
         .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()