Merge pull request #24941 from WanliZhong:v_exp

Add support for v_exp (exponential) #24941 This PR aims to implement `v_exp(v_float16 x)`, `v_exp(v_float32 x)` and `v_exp(v_float64 x)`. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
8 months ago · 6e1864e3fc
parent 75339a5528
commit 6e1864e3fc
5 changed files with 328 additions and 41 deletions
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -1239,6 +1239,7 @@ namespace CV__SIMD_NAMESPACE {
 #define CV_SIMD 0
 #endif
 #include "intrin_math.hpp"
 #include "simd_utils.impl.hpp"
 #ifndef CV_DOXYGEN
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -263,7 +263,7 @@ Most of these operations return only one value.
 ### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp
 - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
 ### Conversions
@ -363,6 +363,7 @@ Floating point:
 |reverse            | x | x |
 |extract_n          | x | x |
 |broadcast_element  | x |   |
 |exp                | x | x |
 @{ */
@ -721,11 +722,33 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
 Only for floating point types.*/
 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
 /**
 * @brief Exponential \f$ e^x \f$ of elements
 *
 * Only for floating point types. Core implementation steps:
 * 1. Decompose Input: Convert the input to \f$ 2^{x \cdot \log_2e} \f$ and split its exponential into integer and fractional parts:
 *    \f$ x \cdot \log_2e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part.
 * 2. Compute \f$ 2^n \f$: Calculated by shifting the bits.
 * 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$.
 *    \f$ C1 \f$ and \f$ C2 \f$ are used to adjust the fractional part.
 * 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result.
 *    - For float16 and float32, use a Taylor Series with 6 terms.
 *    - For float64, use Pade Polynomials Approximation with 4 terms.
 * 5. Combine Results: Multiply the two parts together to get the final result:
 *    \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$.
 *
 * @note The precision of the calculation depends on the implementation and the data type of the input vector.
 */
 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
 #define OPENCV_HAL_MATH_HAVE_EXP 1
 //! @cond IGNORED
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
 #define OPENCV_HAL_MATH_HAVE_SIN 1
 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
+#define OPENCV_HAL_MATH_HAVE_COS 1
 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
 #define OPENCV_HAL_MATH_HAVE_LOG 1
 //! @endcond
 /** @brief Absolute value of elements
--- a/modules/core/include/opencv2/core/hal/intrin_math.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_math.hpp
@ -0,0 +1,200 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 // This header is not standalone. Don't include directly, use "intrin.hpp" instead.
 #ifdef OPENCV_HAL_INTRIN_HPP  // defined in intrin.hpp
 namespace CV__SIMD_NAMESPACE {
 /* Universal Intrinsics implementation of sin, cos, exp and log
   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library
 */
 /* Copyright (C) 2010,2011  RJVB - extensions */
 /* Copyright (C) 2011  Julien Pommier
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.
  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:
  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
  (this is the zlib license)
 */
 #ifndef OPENCV_HAL_MATH_HAVE_EXP
 //! @name Exponential
 //! @{
 #if defined(CV_SIMD_FP16) && CV_SIMD_FP16
    // Implementation is the same as float32 vector.
    inline v_float16 v_exp(const v_float16 &x) {
        const v_float16 _vexp_lo_f16 = vx_setall_f16(-10.7421875f);
        const v_float16 _vexp_hi_f16 = vx_setall_f16(11.f);
        const v_float16 _vexp_half_fp16 = vx_setall_f16(0.5f);
        const v_float16 _vexp_one_fp16 = vx_setall_f16(1.f);
        const v_float16 _vexp_LOG2EF_f16 = vx_setall_f16(1.44269504088896341f);
        const v_float16 _vexp_C1_f16 = vx_setall_f16(-6.93359375E-1f);
        const v_float16 _vexp_C2_f16 = vx_setall_f16(2.12194440E-4f);
        const v_float16 _vexp_p0_f16 = vx_setall_f16(1.9875691500E-4f);
        const v_float16 _vexp_p1_f16 = vx_setall_f16(1.3981999507E-3f);
        const v_float16 _vexp_p2_f16 = vx_setall_f16(8.3334519073E-3f);
        const v_float16 _vexp_p3_f16 = vx_setall_f16(4.1665795894E-2f);
        const v_float16 _vexp_p4_f16 = vx_setall_f16(1.6666665459E-1f);
        const v_float16 _vexp_p5_f16 = vx_setall_f16(5.0000001201E-1f);
        const v_int16 _vexp_bias_s16 = vx_setall_s16(0xf);
        v_float16 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
        v_int16 _vexp_mm;
        // compute exponential of x
        _vexp_x = v_max(x, _vexp_lo_f16);
        _vexp_x = v_min(_vexp_x, _vexp_hi_f16);
        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
        _vexp_mm = v_floor(_vexp_);
        _vexp_ = v_cvt_f16(_vexp_mm);
        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
        _vexp_mm = v_shl(_vexp_mm, 10);
        _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
        _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
        _vexp_xx = v_mul(_vexp_x, _vexp_x);
        _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
        _vexp_y = v_add(_vexp_y, _vexp_one_fp16);
        _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
        // exp(NAN) -> NAN
        v_float16 mask_not_nan = v_not_nan(x);
        return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
    }
 #endif
    inline v_float32 v_exp(const v_float32 &x) {
        const v_float32 _vexp_lo_f32 = vx_setall_f32(-88.3762626647949f);
        const v_float32 _vexp_hi_f32 = vx_setall_f32(89.f);
        const v_float32 _vexp_half_fp32 = vx_setall_f32(0.5f);
        const v_float32 _vexp_one_fp32 = vx_setall_f32(1.f);
        const v_float32 _vexp_LOG2EF_f32 = vx_setall_f32(1.44269504088896341f);
        const v_float32 _vexp_C1_f32 = vx_setall_f32(-6.93359375E-1f);
        const v_float32 _vexp_C2_f32 = vx_setall_f32(2.12194440E-4f);
        const v_float32 _vexp_p0_f32 = vx_setall_f32(1.9875691500E-4f);
        const v_float32 _vexp_p1_f32 = vx_setall_f32(1.3981999507E-3f);
        const v_float32 _vexp_p2_f32 = vx_setall_f32(8.3334519073E-3f);
        const v_float32 _vexp_p3_f32 = vx_setall_f32(4.1665795894E-2f);
        const v_float32 _vexp_p4_f32 = vx_setall_f32(1.6666665459E-1f);
        const v_float32 _vexp_p5_f32 = vx_setall_f32(5.0000001201E-1f);
        const v_int32 _vexp_bias_s32 = vx_setall_s32(0x7f);
        v_float32 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
        v_int32 _vexp_mm;
        // compute exponential of x
        _vexp_x = v_max(x, _vexp_lo_f32);
        _vexp_x = v_min(_vexp_x, _vexp_hi_f32);
        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
        _vexp_mm = v_floor(_vexp_);
        _vexp_ = v_cvt_f32(_vexp_mm);
        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
        _vexp_mm = v_shl(_vexp_mm, 23);
        _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
        _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
        _vexp_xx = v_mul(_vexp_x, _vexp_x);
        _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
        _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
        _vexp_y = v_add(_vexp_y, _vexp_one_fp32);
        _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
        // exp(NAN) -> NAN
        v_float32 mask_not_nan = v_not_nan(x);
        return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
    }
 #if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 v_exp(const v_float64 &x) {
        const v_float64 _vexp_lo_f64 = vx_setall_f64(-709.43613930310391424428);
        const v_float64 _vexp_hi_f64 = vx_setall_f64(710.);
        const v_float64 _vexp_half_f64 = vx_setall_f64(0.5);
        const v_float64 _vexp_one_f64 = vx_setall_f64(1.0);
        const v_float64 _vexp_two_f64 = vx_setall_f64(2.0);
        const v_float64 _vexp_LOG2EF_f64 = vx_setall_f64(1.44269504088896340736);
        const v_float64 _vexp_C1_f64 = vx_setall_f64(-6.93145751953125E-1);
        const v_float64 _vexp_C2_f64 = vx_setall_f64(-1.42860682030941723212E-6);
        const v_float64 _vexp_p0_f64 = vx_setall_f64(1.26177193074810590878E-4);
        const v_float64 _vexp_p1_f64 = vx_setall_f64(3.02994407707441961300E-2);
        const v_float64 _vexp_p2_f64 = vx_setall_f64(9.99999999999999999910E-1);
        const v_float64 _vexp_q0_f64 = vx_setall_f64(3.00198505138664455042E-6);
        const v_float64 _vexp_q1_f64 = vx_setall_f64(2.52448340349684104192E-3);
        const v_float64 _vexp_q2_f64 = vx_setall_f64(2.27265548208155028766E-1);
        const v_float64 _vexp_q3_f64 = vx_setall_f64(2.00000000000000000009E0);
        const v_int64 _vexp_bias_s64 = vx_setall_s64(0x3ff);
        v_float64 _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
        v_int64 _vexp_mm;
        // compute exponential of x
        _vexp_x = v_max(x, _vexp_lo_f64);
        _vexp_x = v_min(_vexp_x, _vexp_hi_f64);
        _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
        _vexp_mm = v_expand_low(v_floor(_vexp_));
        _vexp_ = v_cvt_f64(_vexp_mm);
        _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
        _vexp_mm = v_shl(_vexp_mm, 52);
        _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
        _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
        _vexp_xx = v_mul(_vexp_x, _vexp_x);
        _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
        _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
        _vexp_y = v_mul(_vexp_y, _vexp_x);
        _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
        _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
        _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
        _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
        _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
        _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
        // exp(NAN) -> NAN
        v_float64 mask_not_nan = v_not_nan(x);
        return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(vx_setall_s64(0x7FF8000000000000)));
    }
 #endif
 #define OPENCV_HAL_MATH_HAVE_EXP 1
 //! @}
 #endif
 }
 #endif  // OPENCV_HAL_INTRIN_HPP
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -1698,6 +1698,103 @@ template<typename R> struct TheTest
        return *this;
    }
    void __test_exp(LaneType dataMax, LaneType diff_thr, LaneType enlarge_factor, LaneType flt_min) {
        int n = VTraits<R>::vlanes();
        // Test overflow and underflow values with step
        const LaneType step = (LaneType) 0.01;
        for (LaneType i = dataMax + 1; i <= dataMax + 11;) {
            Data<R> dataUpperBound, dataLowerBound, resOverflow, resUnderflow;
            for (int j = 0; j < n; ++j) {
                dataUpperBound[j] = i;
                dataLowerBound[j] = -i;
                i += step;
            }
            R upperBound = dataUpperBound, lowerBound = dataLowerBound;
            resOverflow = v_exp(upperBound);
            resUnderflow = v_exp(lowerBound);
            for (int j = 0; j < n; ++j) {
                SCOPED_TRACE(cv::format("Overflow/Underflow test value: %f", i));
                EXPECT_TRUE(resOverflow[j] > 0 && std::isinf(resOverflow[j]));
                EXPECT_GE(resUnderflow[j], 0);
                EXPECT_LT(resUnderflow[j], flt_min);
            }
        }
        // Test random values combined with special values
        std::vector<LaneType> specialValues = {0, 1, INFINITY, -INFINITY, NAN, dataMax};
        const int testRandNum = 10000;
        const double specialValueProbability = 0.1; // 10% chance to insert a special value
        cv::RNG_MT19937 rng;
        for (int i = 0; i < testRandNum; i++) {
            Data<R> dataRand, resRand;
            for (int j = 0; j < n; ++j) {
                if (rng.uniform(0.f, 1.f) <= specialValueProbability) {
                    // Insert a special value
                    int specialValueIndex = rng.uniform(0, (int) specialValues.size());
                    dataRand[j] = specialValues[specialValueIndex];
                } else {
                    // Generate random data in [-dataMax*1.1, dataMax*1.1]
                    dataRand[j] = (LaneType) rng.uniform(-dataMax * 1.1, dataMax * 1.1);
                }
            }
            // Compare with std::exp
            R x = dataRand;
            resRand = v_exp(x);
            for (int j = 0; j < n; ++j) {
                SCOPED_TRACE(cv::format("Random test value: %f", dataRand[j]));
                LaneType std_exp = std::exp(dataRand[j]);
                if (dataRand[j] == 0) {
                    // input 0 -> output 1
                    EXPECT_EQ(resRand[j], 1);
                } else if (dataRand[j] == 1) {
                    // input 1 -> output e
                    EXPECT_NEAR((LaneType) M_E, resRand[j], 1e-15);
                } else if (dataRand[j] > 0 && std::isinf(dataRand[j])) {
                    // input INF -> output INF
                    EXPECT_TRUE(resRand[j] > 0 && std::isinf(resRand[j]));
                } else if (dataRand[j] < 0 && std::isinf(dataRand[j])) {
                    // input -INF -> output 0
                    EXPECT_EQ(resRand[j], 0);
                } else if (std::isnan(dataRand[j])) {
                    // input NaN -> output NaN
                    EXPECT_TRUE(std::isnan(resRand[j]));
                } else if (dataRand[j] == dataMax) {
                    // input dataMax -> output less than INFINITY
                    EXPECT_LT(resRand[j], (LaneType) INFINITY);
                } else if (std::isinf(resRand[j])) {
                    // output INF -> input close to edge
                    EXPECT_GT(dataRand[j], dataMax);
                } else {
                    EXPECT_GE(resRand[j], 0);
                    EXPECT_LT(std::abs(resRand[j] - std_exp), diff_thr * (std_exp + flt_min * enlarge_factor));
                }
            }
        }
    }
    TheTest &test_exp_fp16() {
 #if CV_SIMD_FP16
        float16_t flt16_min;
        uint16_t flt16_min_hex = 0x0400;
        std::memcpy(&flt16_min, &flt16_min_hex, sizeof(float16_t));
        __test_exp((float16_t) 10, (float16_t) 1e-2, (float16_t) 1e2, flt16_min);
 #endif
        return *this;
    }
    TheTest &test_exp_fp32() {
        __test_exp(88.0f, 1e-6f, 1e6f, FLT_MIN);
        return *this;
    }
    TheTest &test_exp_fp64() {
 #if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
        __test_exp(709.0, 1e-15, 1e15, DBL_MIN);
 #endif
        return *this;
    }
 };
 #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION);
@ -2011,6 +2108,7 @@ void test_hal_intrin_float32()
        .test_extract_highest()
        .test_broadcast_highest()
        .test_pack_triplets()
        .test_exp_fp32()
 #if CV_SIMD_WIDTH == 32
        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
@ -2035,13 +2133,13 @@ void test_hal_intrin_float64()
        .test_mask()
        .test_unpack()
        .test_float_math()
        .test_round_pair_f64()
        .test_float_cvt32()
        .test_reverse()
        .test_extract<0>().test_extract<1>()
        .test_rotate<0>().test_rotate<1>()
        .test_extract_n<0>().test_extract_n<1>()
        .test_extract_highest()
        .test_exp_fp64()
        //.test_broadcast_element<0>().test_broadcast_element<1>()
 #if CV_SIMD_WIDTH == 32
        .test_extract<2>().test_extract<3>()
@ -2062,6 +2160,7 @@ void test_hal_intrin_float16()
 #if CV_SIMD_FP16
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
        .test_exp_fp16()
 #endif
        ;
 #else
--- a/modules/dnn/src/layers/cpu_kernels/softmax.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/softmax.cpp
@ -71,48 +71,12 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
            // calculate the exp value along the axis
            v_float32 vs = vx_setzero_f32();
            vmax = vx_setall_f32(maxVal);
-            // initialize vexp constant
+            v_float32 val;
            v_float32 _vexp_lo = vx_setall_f32(-88.3762626647949f);
            v_float32 _vexp_hi = vx_setall_f32(88.3762626647949f);
            v_float32 _vexp_half = vx_setall_f32(0.5f);
            v_float32 _vexp_one = vx_setall_f32(1.f);
            v_float32 _vexp_LOG2EF = vx_setall_f32(1.44269504088896341f);
            v_float32 _vexp_C1 = vx_setall_f32(-0.693359375f);
            v_float32 _vexp_C2 = vx_setall_f32(2.12194440e-4f);
            v_float32 _vexp_p0 = vx_setall_f32(1.9875691500E-4f);
            v_float32 _vexp_p1 = vx_setall_f32(1.3981999507E-3f);
            v_float32 _vexp_p2 = vx_setall_f32(8.3334519073E-3f);
            v_float32 _vexp_p3 = vx_setall_f32(4.1665795894E-2f);
            v_float32 _vexp_p4 = vx_setall_f32(1.6666665459E-1f);
            v_float32 _vexp_p5 = vx_setall_f32(5.0000001201E-1f);
            // initialize temp vectors for vexp
            v_float32 val, _vexp_, _vexp_x, _vexp_y, _vexp_z;
            v_int32 _vexp_mm;
            // calculate and sum all data along axis
            for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) {
                val = vx_load(axisBuf + cnDim);
                val = v_sub(val, vmax);
-
+                val = v_exp(val);
                // compute vexp of val
                _vexp_x = v_min(val, _vexp_hi);
                _vexp_x = v_max(_vexp_x, _vexp_lo);
                _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF, _vexp_half);
                _vexp_mm = v_floor(_vexp_);
                _vexp_ = v_cvt_f32(_vexp_mm);
                _vexp_mm = v_add(_vexp_mm, vx_setall_s32(0x7f));
                _vexp_mm = v_shl(_vexp_mm, 23);
                _vexp_x = v_fma(_vexp_, _vexp_C1, _vexp_x);
                _vexp_x = v_fma(_vexp_, _vexp_C2, _vexp_x);
                _vexp_z = v_mul(_vexp_x, _vexp_x);
                _vexp_y = v_fma(_vexp_x, _vexp_p0, _vexp_p1);
                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2);
                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3);
                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4);
                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5);
                _vexp_y = v_fma(_vexp_y, _vexp_z, _vexp_x);
                _vexp_y = v_add(_vexp_y, _vexp_one);
                val = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
                vs = v_add(vs, val);
                v_store(axisBuf + cnDim, val);