Merge pull request #24941 from WanliZhong:v_exp

Add support for v_exp (exponential) #24941 This PR aims to implement `v_exp(v_float16 x)`, `v_exp(v_float32 x)` and `v_exp(v_float64 x)`. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
8 months ago · 6e1864e3fc
parent 75339a5528
commit 6e1864e3fc
5 changed files with 328 additions and 41 deletions
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -1239,6 +1239,7 @@ namespace CV__SIMD_NAMESPACE {
 #define CV_SIMD 0
 #endif

+#include "intrin_math.hpp"
 #include "simd_utils.impl.hpp"

 #ifndef CV_DOXYGEN
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -263,7 +263,7 @@ Most of these operations return only one value.

 ### Other math

- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp
 - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs

 ### Conversions
@ -363,6 +363,7 @@ Floating point:
 |reverse            | x | x |
 |extract_n          | x | x |
 |broadcast_element  | x |   |
+|exp                | x | x |

 @{ */

@ -721,11 +722,33 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
 Only for floating point types.*/
 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)

+/**
+ * @brief Exponential \f$ e^x \f$ of elements
+ *
+ * Only for floating point types. Core implementation steps:
+ * 1. Decompose Input: Convert the input to \f$ 2^{x \cdot \log_2e} \f$ and split its exponential into integer and fractional parts:
+ *    \f$ x \cdot \log_2e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part.
+ * 2. Compute \f$ 2^n \f$: Calculated by shifting the bits.
+ * 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$.
+ *    \f$ C1 \f$ and \f$ C2 \f$ are used to adjust the fractional part.
+ * 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result.
+ *    - For float16 and float32, use a Taylor Series with 6 terms.
+ *    - For float64, use Pade Polynomials Approximation with 4 terms.
+ * 5. Combine Results: Multiply the two parts together to get the final result:
+ *    \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$.
+ *
+ * @note The precision of the calculation depends on the implementation and the data type of the input vector.
+ */
+OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
+#define OPENCV_HAL_MATH_HAVE_EXP 1
+
 //! @cond IGNORED
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
+#define OPENCV_HAL_MATH_HAVE_SIN 1
 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
+#define OPENCV_HAL_MATH_HAVE_COS 1
 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
+#define OPENCV_HAL_MATH_HAVE_LOG 1
 //! @endcond

 /** @brief Absolute value of elements
--- a/modules/core/include/opencv2/core/hal/intrin_math.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_math.hpp
@ -0,0 +1,200 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
+#ifdef OPENCV_HAL_INTRIN_HPP  // defined in intrin.hpp
+
+namespace CV__SIMD_NAMESPACE {
+
+/* Universal Intrinsics implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/
+
+/* Copyright (C) 2010,2011  RJVB - extensions */
+/* Copyright (C) 2011  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#ifndef OPENCV_HAL_MATH_HAVE_EXP
+
+//! @name Exponential
+//! @{
+#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
+    // Implementation is the same as float32 vector.
+    inline v_float16 v_exp(const v_float16 &x) {
+        const v_float16 _vexp_lo_f16 = vx_setall_f16(-10.7421875f);
+        const v_float16 _vexp_hi_f16 = vx_setall_f16(11.f);
+        const v_float16 _vexp_half_fp16 = vx_setall_f16(0.5f);
+        const v_float16 _vexp_one_fp16 = vx_setall_f16(1.f);
+        const v_float16 _vexp_LOG2EF_f16 = vx_setall_f16(1.44269504088896341f);
+        const v_float16 _vexp_C1_f16 = vx_setall_f16(-6.93359375E-1f);
+        const v_float16 _vexp_C2_f16 = vx_setall_f16(2.12194440E-4f);
+        const v_float16 _vexp_p0_f16 = vx_setall_f16(1.9875691500E-4f);
+        const v_float16 _vexp_p1_f16 = vx_setall_f16(1.3981999507E-3f);
+        const v_float16 _vexp_p2_f16 = vx_setall_f16(8.3334519073E-3f);
+        const v_float16 _vexp_p3_f16 = vx_setall_f16(4.1665795894E-2f);
+        const v_float16 _vexp_p4_f16 = vx_setall_f16(1.6666665459E-1f);
+        const v_float16 _vexp_p5_f16 = vx_setall_f16(5.0000001201E-1f);
+        const v_int16 _vexp_bias_s16 = vx_setall_s16(0xf);
+
+        v_float16 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
+        v_int16 _vexp_mm;
+
+        // compute exponential of x
+        _vexp_x = v_max(x, _vexp_lo_f16);
+        _vexp_x = v_min(_vexp_x, _vexp_hi_f16);
+
+        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
+        _vexp_mm = v_floor(_vexp_);
+        _vexp_ = v_cvt_f16(_vexp_mm);
+        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
+        _vexp_mm = v_shl(_vexp_mm, 10);
+
+        _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
+        _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
+        _vexp_xx = v_mul(_vexp_x, _vexp_x);
+
+        _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
+
+        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
+        _vexp_y = v_add(_vexp_y, _vexp_one_fp16);
+        _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
+
+        // exp(NAN) -> NAN
+        v_float16 mask_not_nan = v_not_nan(x);
+        return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
+    }
+#endif
+
+    inline v_float32 v_exp(const v_float32 &x) {
+        const v_float32 _vexp_lo_f32 = vx_setall_f32(-88.3762626647949f);
+        const v_float32 _vexp_hi_f32 = vx_setall_f32(89.f);
+        const v_float32 _vexp_half_fp32 = vx_setall_f32(0.5f);
+        const v_float32 _vexp_one_fp32 = vx_setall_f32(1.f);
+        const v_float32 _vexp_LOG2EF_f32 = vx_setall_f32(1.44269504088896341f);
+        const v_float32 _vexp_C1_f32 = vx_setall_f32(-6.93359375E-1f);
+        const v_float32 _vexp_C2_f32 = vx_setall_f32(2.12194440E-4f);
+        const v_float32 _vexp_p0_f32 = vx_setall_f32(1.9875691500E-4f);
+        const v_float32 _vexp_p1_f32 = vx_setall_f32(1.3981999507E-3f);
+        const v_float32 _vexp_p2_f32 = vx_setall_f32(8.3334519073E-3f);
+        const v_float32 _vexp_p3_f32 = vx_setall_f32(4.1665795894E-2f);
+        const v_float32 _vexp_p4_f32 = vx_setall_f32(1.6666665459E-1f);
+        const v_float32 _vexp_p5_f32 = vx_setall_f32(5.0000001201E-1f);
+        const v_int32 _vexp_bias_s32 = vx_setall_s32(0x7f);
+
+        v_float32 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
+        v_int32 _vexp_mm;
+
+        // compute exponential of x
+        _vexp_x = v_max(x, _vexp_lo_f32);
+        _vexp_x = v_min(_vexp_x, _vexp_hi_f32);
+
+        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
+        _vexp_mm = v_floor(_vexp_);
+        _vexp_ = v_cvt_f32(_vexp_mm);
+        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
+        _vexp_mm = v_shl(_vexp_mm, 23);
+
+        _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
+        _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
+        _vexp_xx = v_mul(_vexp_x, _vexp_x);
+
+        _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
+        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
+
+        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
+        _vexp_y = v_add(_vexp_y, _vexp_one_fp32);
+        _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
+
+        // exp(NAN) -> NAN
+        v_float32 mask_not_nan = v_not_nan(x);
+        return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
+    }
+
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    inline v_float64 v_exp(const v_float64 &x) {
+        const v_float64 _vexp_lo_f64 = vx_setall_f64(-709.43613930310391424428);
+        const v_float64 _vexp_hi_f64 = vx_setall_f64(710.);
+        const v_float64 _vexp_half_f64 = vx_setall_f64(0.5);
+        const v_float64 _vexp_one_f64 = vx_setall_f64(1.0);
+        const v_float64 _vexp_two_f64 = vx_setall_f64(2.0);
+        const v_float64 _vexp_LOG2EF_f64 = vx_setall_f64(1.44269504088896340736);
+        const v_float64 _vexp_C1_f64 = vx_setall_f64(-6.93145751953125E-1);
+        const v_float64 _vexp_C2_f64 = vx_setall_f64(-1.42860682030941723212E-6);
+        const v_float64 _vexp_p0_f64 = vx_setall_f64(1.26177193074810590878E-4);
+        const v_float64 _vexp_p1_f64 = vx_setall_f64(3.02994407707441961300E-2);
+        const v_float64 _vexp_p2_f64 = vx_setall_f64(9.99999999999999999910E-1);
+        const v_float64 _vexp_q0_f64 = vx_setall_f64(3.00198505138664455042E-6);
+        const v_float64 _vexp_q1_f64 = vx_setall_f64(2.52448340349684104192E-3);
+        const v_float64 _vexp_q2_f64 = vx_setall_f64(2.27265548208155028766E-1);
+        const v_float64 _vexp_q3_f64 = vx_setall_f64(2.00000000000000000009E0);
+        const v_int64 _vexp_bias_s64 = vx_setall_s64(0x3ff);
+
+        v_float64 _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
+        v_int64 _vexp_mm;
+
+        // compute exponential of x
+        _vexp_x = v_max(x, _vexp_lo_f64);
+        _vexp_x = v_min(_vexp_x, _vexp_hi_f64);
+
+        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
+        _vexp_mm = v_expand_low(v_floor(_vexp_));
+        _vexp_ = v_cvt_f64(_vexp_mm);
+        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
+        _vexp_mm = v_shl(_vexp_mm, 52);
+
+        _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
+        _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
+        _vexp_xx = v_mul(_vexp_x, _vexp_x);
+
+        _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
+        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
+        _vexp_y = v_mul(_vexp_y, _vexp_x);
+
+        _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
+        _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
+        _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
+
+        _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
+        _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
+        _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
+
+        // exp(NAN) -> NAN
+        v_float64 mask_not_nan = v_not_nan(x);
+        return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(vx_setall_s64(0x7FF8000000000000)));
+    }
+#endif
+
+#define OPENCV_HAL_MATH_HAVE_EXP 1
+//! @}
+
+#endif
+}
+#endif  // OPENCV_HAL_INTRIN_HPP
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -1698,6 +1698,103 @@ template<typename R> struct TheTest
        return *this;
    }

+    void __test_exp(LaneType dataMax, LaneType diff_thr, LaneType enlarge_factor, LaneType flt_min) {
+        int n = VTraits<R>::vlanes();
+
+        // Test overflow and underflow values with step
+        const LaneType step = (LaneType) 0.01;
+        for (LaneType i = dataMax + 1; i <= dataMax + 11;) {
+            Data<R> dataUpperBound, dataLowerBound, resOverflow, resUnderflow;
+            for (int j = 0; j < n; ++j) {
+                dataUpperBound[j] = i;
+                dataLowerBound[j] = -i;
+                i += step;
+            }
+            R upperBound = dataUpperBound, lowerBound = dataLowerBound;
+            resOverflow = v_exp(upperBound);
+            resUnderflow = v_exp(lowerBound);
+            for (int j = 0; j < n; ++j) {
+                SCOPED_TRACE(cv::format("Overflow/Underflow test value: %f", i));
+                EXPECT_TRUE(resOverflow[j] > 0 && std::isinf(resOverflow[j]));
+                EXPECT_GE(resUnderflow[j], 0);
+                EXPECT_LT(resUnderflow[j], flt_min);
+            }
+        }
+
+        // Test random values combined with special values
+        std::vector<LaneType> specialValues = {0, 1, INFINITY, -INFINITY, NAN, dataMax};
+        const int testRandNum = 10000;
+        const double specialValueProbability = 0.1; // 10% chance to insert a special value
+        cv::RNG_MT19937 rng;
+
+        for (int i = 0; i < testRandNum; i++) {
+            Data<R> dataRand, resRand;
+            for (int j = 0; j < n; ++j) {
+                if (rng.uniform(0.f, 1.f) <= specialValueProbability) {
+                    // Insert a special value
+                    int specialValueIndex = rng.uniform(0, (int) specialValues.size());
+                    dataRand[j] = specialValues[specialValueIndex];
+                } else {
+                    // Generate random data in [-dataMax*1.1, dataMax*1.1]
+                    dataRand[j] = (LaneType) rng.uniform(-dataMax * 1.1, dataMax * 1.1);
+                }
+            }
+            // Compare with std::exp
+            R x = dataRand;
+            resRand = v_exp(x);
+            for (int j = 0; j < n; ++j) {
+                SCOPED_TRACE(cv::format("Random test value: %f", dataRand[j]));
+                LaneType std_exp = std::exp(dataRand[j]);
+                if (dataRand[j] == 0) {
+                    // input 0 -> output 1
+                    EXPECT_EQ(resRand[j], 1);
+                } else if (dataRand[j] == 1) {
+                    // input 1 -> output e
+                    EXPECT_NEAR((LaneType) M_E, resRand[j], 1e-15);
+                } else if (dataRand[j] > 0 && std::isinf(dataRand[j])) {
+                    // input INF -> output INF
+                    EXPECT_TRUE(resRand[j] > 0 && std::isinf(resRand[j]));
+                } else if (dataRand[j] < 0 && std::isinf(dataRand[j])) {
+                    // input -INF -> output 0
+                    EXPECT_EQ(resRand[j], 0);
+                } else if (std::isnan(dataRand[j])) {
+                    // input NaN -> output NaN
+                    EXPECT_TRUE(std::isnan(resRand[j]));
+                } else if (dataRand[j] == dataMax) {
+                    // input dataMax -> output less than INFINITY
+                    EXPECT_LT(resRand[j], (LaneType) INFINITY);
+                } else if (std::isinf(resRand[j])) {
+                    // output INF -> input close to edge
+                    EXPECT_GT(dataRand[j], dataMax);
+                } else {
+                    EXPECT_GE(resRand[j], 0);
+                    EXPECT_LT(std::abs(resRand[j] - std_exp), diff_thr * (std_exp + flt_min * enlarge_factor));
+                }
+            }
+        }
+    }
+
+    TheTest &test_exp_fp16() {
+#if CV_SIMD_FP16
+        float16_t flt16_min;
+        uint16_t flt16_min_hex = 0x0400;
+        std::memcpy(&flt16_min, &flt16_min_hex, sizeof(float16_t));
+        __test_exp((float16_t) 10, (float16_t) 1e-2, (float16_t) 1e2, flt16_min);
+#endif
+        return *this;
+    }
+
+    TheTest &test_exp_fp32() {
+        __test_exp(88.0f, 1e-6f, 1e6f, FLT_MIN);
+        return *this;
+    }
+
+    TheTest &test_exp_fp64() {
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+        __test_exp(709.0, 1e-15, 1e15, DBL_MIN);
+#endif
+        return *this;
+    }
 };

 #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION);
@ -2011,6 +2108,7 @@ void test_hal_intrin_float32()
        .test_extract_highest()
        .test_broadcast_highest()
        .test_pack_triplets()
+        .test_exp_fp32()
 #if CV_SIMD_WIDTH == 32
        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
@ -2035,13 +2133,13 @@ void test_hal_intrin_float64()
        .test_mask()
        .test_unpack()
        .test_float_math()
-        .test_round_pair_f64()
        .test_float_cvt32()
        .test_reverse()
        .test_extract<0>().test_extract<1>()
        .test_rotate<0>().test_rotate<1>()
        .test_extract_n<0>().test_extract_n<1>()
        .test_extract_highest()
+        .test_exp_fp64()
        //.test_broadcast_element<0>().test_broadcast_element<1>()
 #if CV_SIMD_WIDTH == 32
        .test_extract<2>().test_extract<3>()
@ -2062,6 +2160,7 @@ void test_hal_intrin_float16()
 #if CV_SIMD_FP16
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
+        .test_exp_fp16()
 #endif
        ;
 #else
--- a/modules/dnn/src/layers/cpu_kernels/softmax.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/softmax.cpp
@ -71,48 +71,12 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
            // calculate the exp value along the axis
            v_float32 vs = vx_setzero_f32();
            vmax = vx_setall_f32(maxVal);
-            // initialize vexp constant
-            v_float32 _vexp_lo = vx_setall_f32(-88.3762626647949f);
-            v_float32 _vexp_hi = vx_setall_f32(88.3762626647949f);
-            v_float32 _vexp_half = vx_setall_f32(0.5f);
-            v_float32 _vexp_one = vx_setall_f32(1.f);
-            v_float32 _vexp_LOG2EF = vx_setall_f32(1.44269504088896341f);
-            v_float32 _vexp_C1 = vx_setall_f32(-0.693359375f);
-            v_float32 _vexp_C2 = vx_setall_f32(2.12194440e-4f);
-            v_float32 _vexp_p0 = vx_setall_f32(1.9875691500E-4f);
-            v_float32 _vexp_p1 = vx_setall_f32(1.3981999507E-3f);
-            v_float32 _vexp_p2 = vx_setall_f32(8.3334519073E-3f);
-            v_float32 _vexp_p3 = vx_setall_f32(4.1665795894E-2f);
-            v_float32 _vexp_p4 = vx_setall_f32(1.6666665459E-1f);
-            v_float32 _vexp_p5 = vx_setall_f32(5.0000001201E-1f);
-            // initialize temp vectors for vexp
-            v_float32 val, _vexp_, _vexp_x, _vexp_y, _vexp_z;
-            v_int32 _vexp_mm;
-
+            v_float32 val;
            // calculate and sum all data along axis
            for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) {
                val = vx_load(axisBuf + cnDim);
                val = v_sub(val, vmax);
-
-                // compute vexp of val
-                _vexp_x = v_min(val, _vexp_hi);
-                _vexp_x = v_max(_vexp_x, _vexp_lo);
-                _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF, _vexp_half);
-                _vexp_mm = v_floor(_vexp_);
-                _vexp_ = v_cvt_f32(_vexp_mm);
-                _vexp_mm = v_add(_vexp_mm, vx_setall_s32(0x7f));
-                _vexp_mm = v_shl(_vexp_mm, 23);
-                _vexp_x = v_fma(_vexp_, _vexp_C1, _vexp_x);
-                _vexp_x = v_fma(_vexp_, _vexp_C2, _vexp_x);
-                _vexp_z = v_mul(_vexp_x, _vexp_x);
-                _vexp_y = v_fma(_vexp_x, _vexp_p0, _vexp_p1);
-                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2);
-                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3);
-                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4);
-                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5);
-                _vexp_y = v_fma(_vexp_y, _vexp_z, _vexp_x);
-                _vexp_y = v_add(_vexp_y, _vexp_one);
-                val = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
+                val = v_exp(val);

                vs = v_add(vs, val);
                v_store(axisBuf + cnDim, val);