Merge pull request #24941 from WanliZhong:v_exp

Add support for v_exp (exponential) #24941

This PR aims to implement `v_exp(v_float16 x)`, `v_exp(v_float32 x)` and `v_exp(v_float64 x)`.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
pull/25856/head
Wanli 5 months ago committed by GitHub
parent 75339a5528
commit 6e1864e3fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 1
      modules/core/include/opencv2/core/hal/intrin.hpp
  2. 27
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  3. 200
      modules/core/include/opencv2/core/hal/intrin_math.hpp
  4. 101
      modules/core/test/test_intrin_utils.hpp
  5. 40
      modules/dnn/src/layers/cpu_kernels/softmax.cpp

@ -1239,6 +1239,7 @@ namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 0 #define CV_SIMD 0
#endif #endif
#include "intrin_math.hpp"
#include "simd_utils.impl.hpp" #include "simd_utils.impl.hpp"
#ifndef CV_DOXYGEN #ifndef CV_DOXYGEN

@ -263,7 +263,7 @@ Most of these operations return only one value.
### Other math ### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
### Conversions ### Conversions
@ -363,6 +363,7 @@ Floating point:
|reverse | x | x | |reverse | x | x |
|extract_n | x | x | |extract_n | x | x |
|broadcast_element | x | | |broadcast_element | x | |
|exp | x | x |
@{ */ @{ */
@ -721,11 +722,33 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
Only for floating point types.*/ Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
/**
* @brief Exponential \f$ e^x \f$ of elements
*
* Only for floating point types. Core implementation steps:
* 1. Decompose Input: Convert the input to \f$ 2^{x \cdot \log_2e} \f$ and split its exponential into integer and fractional parts:
* \f$ x \cdot \log_2e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part.
* 2. Compute \f$ 2^n \f$: Calculated by shifting the bits.
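* 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 = x - n \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$.
* The constant \f$ -\ln2 \f$ is split into two parts, \f$ C1 \f$ and \f$ C2 \f$ (\f$ C1 + C2 = -\ln2 \f$), which are applied to \f$ x \f$ in two consecutive fused multiply-add steps.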
* 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result.
* - For float16 and float32, use a Taylor series with 6 terms.
* - For float64, use a Pade polynomial approximation with 4 terms.
* 5. Combine Results: Multiply the two parts together to get the final result:
* \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$.
*
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
#define OPENCV_HAL_MATH_HAVE_EXP 1
//! @cond IGNORED //! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
#define OPENCV_HAL_MATH_HAVE_SIN 1
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) #define OPENCV_HAL_MATH_HAVE_COS 1
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
#define OPENCV_HAL_MATH_HAVE_LOG 1
//! @endcond //! @endcond
/** @brief Absolute value of elements /** @brief Absolute value of elements

@ -0,0 +1,200 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp
namespace CV__SIMD_NAMESPACE {
/* Universal Intrinsics implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2010,2011 RJVB - extensions */
/* Copyright (C) 2011 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#ifndef OPENCV_HAL_MATH_HAVE_EXP
//! @name Exponential
//! @{
#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
// Half-precision e^x. Uses the same sequence of steps as the float32 overload,
// with constants and bit layout adapted to binary16 (10 mantissa bits, bias 15).
// NaN lanes of the input yield NaN in the output.
inline v_float16 v_exp(const v_float16 &x) {
    // Clamp range applied to the argument before the exponent is built.
    const v_float16 kArgLo = vx_setall_f16(-10.7421875f);
    const v_float16 kArgHi = vx_setall_f16(11.f);
    const v_float16 kHalf = vx_setall_f16(0.5f);
    const v_float16 kOne = vx_setall_f16(1.f);
    // log2(e): converts the argument to a base-2 exponent.
    const v_float16 kLog2e = vx_setall_f16(1.44269504088896341f);
    // Two-part constant whose sum is -ln(2); applied in two FMA steps.
    const v_float16 kNegLn2Hi = vx_setall_f16(-6.93359375E-1f);
    const v_float16 kNegLn2Lo = vx_setall_f16(2.12194440E-4f);
    // Polynomial coefficients, highest order first.
    const v_float16 kPoly0 = vx_setall_f16(1.9875691500E-4f);
    const v_float16 kPoly1 = vx_setall_f16(1.3981999507E-3f);
    const v_float16 kPoly2 = vx_setall_f16(8.3334519073E-3f);
    const v_float16 kPoly3 = vx_setall_f16(4.1665795894E-2f);
    const v_float16 kPoly4 = vx_setall_f16(1.6666665459E-1f);
    const v_float16 kPoly5 = vx_setall_f16(5.0000001201E-1f);
    // Exponent bias of the binary16 format.
    const v_int16 kExpBias = vx_setall_s16(0xf);

    // Clamp the argument (lower bound first, then upper bound).
    v_float16 reduced = v_min(v_max(x, kArgLo), kArgHi);

    // n = floor(x * log2(e) + 0.5), kept both as integer and as float.
    const v_int16 expInt = v_floor(v_fma(reduced, kLog2e, kHalf));
    const v_float16 expFlt = v_cvt_f16(expInt);

    // Bit pattern of 2^n: biased exponent moved above the 10 mantissa bits.
    const v_int16 pow2Bits = v_shl(v_add(expInt, kExpBias), 10);

    // reduced = x - n * ln(2), subtracted in two parts.
    reduced = v_fma(expFlt, kNegLn2Hi, reduced);
    reduced = v_fma(expFlt, kNegLn2Lo, reduced);
    const v_float16 reducedSq = v_mul(reduced, reduced);

    // Horner evaluation of the polynomial in `reduced`.
    v_float16 poly = v_fma(reduced, kPoly0, kPoly1);
    poly = v_fma(poly, reduced, kPoly2);
    poly = v_fma(poly, reduced, kPoly3);
    poly = v_fma(poly, reduced, kPoly4);
    poly = v_fma(poly, reduced, kPoly5);
    // e^r ~= 1 + r + r^2 * poly
    poly = v_fma(poly, reducedSq, reduced);
    poly = v_add(poly, kOne);

    // Scale by 2^n.
    const v_float16 scaled = v_mul(poly, v_reinterpret_as_f16(pow2Bits));

    // exp(NAN) -> NAN (0x7e00 is a quiet-NaN bit pattern in binary16)
    return v_select(v_not_nan(x), scaled, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
}
#endif
// Single-precision e^x for every lane of x.
// Steps: clamp the argument, split it as x = n*ln(2) + r with integer n,
// approximate e^r with a polynomial, build 2^n directly from its bit pattern,
// and multiply both parts. NaN lanes of the input yield NaN in the output.
inline v_float32 v_exp(const v_float32 &x) {
    // Clamp range applied to the argument before the exponent is built.
    const v_float32 kArgLo = vx_setall_f32(-88.3762626647949f);
    const v_float32 kArgHi = vx_setall_f32(89.f);
    const v_float32 kHalf = vx_setall_f32(0.5f);
    const v_float32 kOne = vx_setall_f32(1.f);
    // log2(e): converts the argument to a base-2 exponent.
    const v_float32 kLog2e = vx_setall_f32(1.44269504088896341f);
    // Two-part constant whose sum is -ln(2); applied in two FMA steps.
    const v_float32 kNegLn2Hi = vx_setall_f32(-6.93359375E-1f);
    const v_float32 kNegLn2Lo = vx_setall_f32(2.12194440E-4f);
    // Polynomial coefficients, highest order first.
    const v_float32 kPoly0 = vx_setall_f32(1.9875691500E-4f);
    const v_float32 kPoly1 = vx_setall_f32(1.3981999507E-3f);
    const v_float32 kPoly2 = vx_setall_f32(8.3334519073E-3f);
    const v_float32 kPoly3 = vx_setall_f32(4.1665795894E-2f);
    const v_float32 kPoly4 = vx_setall_f32(1.6666665459E-1f);
    const v_float32 kPoly5 = vx_setall_f32(5.0000001201E-1f);
    // Exponent bias of the binary32 format.
    const v_int32 kExpBias = vx_setall_s32(0x7f);

    // Clamp the argument (lower bound first, then upper bound).
    v_float32 reduced = v_min(v_max(x, kArgLo), kArgHi);

    // n = floor(x * log2(e) + 0.5), kept both as integer and as float.
    const v_int32 expInt = v_floor(v_fma(reduced, kLog2e, kHalf));
    const v_float32 expFlt = v_cvt_f32(expInt);

    // Bit pattern of 2^n: biased exponent moved above the 23 mantissa bits.
    const v_int32 pow2Bits = v_shl(v_add(expInt, kExpBias), 23);

    // reduced = x - n * ln(2), subtracted in two parts.
    reduced = v_fma(expFlt, kNegLn2Hi, reduced);
    reduced = v_fma(expFlt, kNegLn2Lo, reduced);
    const v_float32 reducedSq = v_mul(reduced, reduced);

    // Horner evaluation of the polynomial in `reduced`.
    v_float32 poly = v_fma(reduced, kPoly0, kPoly1);
    poly = v_fma(poly, reduced, kPoly2);
    poly = v_fma(poly, reduced, kPoly3);
    poly = v_fma(poly, reduced, kPoly4);
    poly = v_fma(poly, reduced, kPoly5);
    // e^r ~= 1 + r + r^2 * poly
    poly = v_fma(poly, reducedSq, reduced);
    poly = v_add(poly, kOne);

    // Scale by 2^n.
    const v_float32 scaled = v_mul(poly, v_reinterpret_as_f32(pow2Bits));

    // exp(NAN) -> NAN (0x7fc00000 is a quiet-NaN bit pattern in binary32)
    return v_select(v_not_nan(x), scaled, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
}
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
// Double-precision e^x for every lane of x.
// Steps: clamp the argument, split it as x = n*ln(2) + r with integer n,
// approximate e^r with a rational function of r, build 2^n directly from its
// bit pattern, and multiply both parts. NaN lanes of the input yield NaN.
inline v_float64 v_exp(const v_float64 &x) {
    // Clamp range applied to the argument before the exponent is built.
    const v_float64 kArgLo = vx_setall_f64(-709.43613930310391424428);
    const v_float64 kArgHi = vx_setall_f64(710.);
    const v_float64 kHalf = vx_setall_f64(0.5);
    const v_float64 kOne = vx_setall_f64(1.0);
    const v_float64 kTwo = vx_setall_f64(2.0);
    // log2(e): converts the argument to a base-2 exponent.
    const v_float64 kLog2e = vx_setall_f64(1.44269504088896340736);
    // Two-part constant whose sum is -ln(2); applied in two FMA steps.
    const v_float64 kNegLn2Hi = vx_setall_f64(-6.93145751953125E-1);
    const v_float64 kNegLn2Lo = vx_setall_f64(-1.42860682030941723212E-6);
    // Numerator coefficients (polynomial in r^2, later multiplied by r).
    const v_float64 kNum0 = vx_setall_f64(1.26177193074810590878E-4);
    const v_float64 kNum1 = vx_setall_f64(3.02994407707441961300E-2);
    const v_float64 kNum2 = vx_setall_f64(9.99999999999999999910E-1);
    // Denominator coefficients (polynomial in r^2).
    const v_float64 kDen0 = vx_setall_f64(3.00198505138664455042E-6);
    const v_float64 kDen1 = vx_setall_f64(2.52448340349684104192E-3);
    const v_float64 kDen2 = vx_setall_f64(2.27265548208155028766E-1);
    const v_float64 kDen3 = vx_setall_f64(2.00000000000000000009E0);
    // Exponent bias of the binary64 format.
    const v_int64 kExpBias = vx_setall_s64(0x3ff);

    // Clamp the argument (lower bound first, then upper bound).
    v_float64 reduced = v_min(v_max(x, kArgLo), kArgHi);

    // n = floor(x * log2(e) + 0.5); the floor result is widened to 64-bit lanes.
    const v_int64 expInt = v_expand_low(v_floor(v_fma(reduced, kLog2e, kHalf)));
    const v_float64 expFlt = v_cvt_f64(expInt);

    // Bit pattern of 2^n: biased exponent moved above the 52 mantissa bits.
    const v_int64 pow2Bits = v_shl(v_add(expInt, kExpBias), 52);

    // reduced = x - n * ln(2), subtracted in two parts.
    reduced = v_fma(expFlt, kNegLn2Hi, reduced);
    reduced = v_fma(expFlt, kNegLn2Lo, reduced);
    const v_float64 reducedSq = v_mul(reduced, reduced);

    // numer = r * P(r^2)
    v_float64 numer = v_fma(reducedSq, kNum0, kNum1);
    numer = v_fma(numer, reducedSq, kNum2);
    numer = v_mul(numer, reduced);

    // denom = Q(r^2)
    v_float64 denom = v_fma(reducedSq, kDen0, kDen1);
    denom = v_fma(reducedSq, denom, kDen2);
    denom = v_fma(reducedSq, denom, kDen3);

    // e^r ~= 1 + 2 * numer / (denom - numer)
    v_float64 result = v_div(numer, v_sub(denom, numer));
    result = v_fma(kTwo, result, kOne);

    // Scale by 2^n.
    result = v_mul(result, v_reinterpret_as_f64(pow2Bits));

    // exp(NAN) -> NAN (0x7FF8000000000000 is a quiet-NaN bit pattern in binary64)
    return v_select(v_not_nan(x), result, v_reinterpret_as_f64(vx_setall_s64(0x7FF8000000000000)));
}
#endif
#define OPENCV_HAL_MATH_HAVE_EXP 1
//! @}
#endif
}
#endif // OPENCV_HAL_INTRIN_HPP

@ -1698,6 +1698,103 @@ template<typename R> struct TheTest
return *this; return *this;
} }
/**
 * Shared driver for the v_exp tests of one lane type.
 *
 * Part 1 feeds inputs from dataMax + 1 up to roughly dataMax + 11 (and their
 * negations) and expects +INF for the positive ones and a value in [0, flt_min)
 * for the negative ones.
 * Part 2 feeds 10000 vectors of random inputs mixed with special values
 * (0, 1, +/-INF, NaN, dataMax) and compares each lane against std::exp.
 *
 * @param dataMax        input that must still give a finite result; inputs of
 *                       dataMax + 1 and above must overflow to +INF
 * @param diff_thr       relative tolerance factor for the comparison with std::exp
 * @param enlarge_factor multiplier of flt_min in the absolute part of the tolerance
 * @param flt_min        upper bound for underflowed results; also scales the
 *                       absolute part of the tolerance
 */
void __test_exp(LaneType dataMax, LaneType diff_thr, LaneType enlarge_factor, LaneType flt_min) {
    int n = VTraits<R>::vlanes();

    // Test overflow and underflow values with step
    const LaneType step = (LaneType) 0.01;
    for (LaneType i = dataMax + 1; i <= dataMax + 11;) {
        Data<R> dataUpperBound, dataLowerBound, resOverflow, resUnderflow;
        // Every lane gets its own value; `i` keeps advancing across vectors.
        for (int j = 0; j < n; ++j) {
            dataUpperBound[j] = i;
            dataLowerBound[j] = -i;
            i += step;
        }
        R upperBound = dataUpperBound, lowerBound = dataLowerBound;
        resOverflow = v_exp(upperBound);
        resUnderflow = v_exp(lowerBound);
        for (int j = 0; j < n; ++j) {
            // Report the value this lane was actually fed. `i` has already moved
            // past the whole vector here, so it would name an untested value.
            // The explicit cast keeps the "%f" vararg a double for every LaneType.
            SCOPED_TRACE(cv::format("Overflow/Underflow test value: %f", (double) dataUpperBound[j]));
            EXPECT_TRUE(resOverflow[j] > 0 && std::isinf(resOverflow[j]));
            EXPECT_GE(resUnderflow[j], 0);
            EXPECT_LT(resUnderflow[j], flt_min);
        }
    }

    // Test random values combined with special values
    std::vector<LaneType> specialValues = {0, 1, INFINITY, -INFINITY, NAN, dataMax};
    const int testRandNum = 10000;
    const double specialValueProbability = 0.1; // 10% chance to insert a special value
    cv::RNG_MT19937 rng;

    for (int i = 0; i < testRandNum; i++) {
        Data<R> dataRand, resRand;
        for (int j = 0; j < n; ++j) {
            if (rng.uniform(0.f, 1.f) <= specialValueProbability) {
                // Insert a special value
                int specialValueIndex = rng.uniform(0, (int) specialValues.size());
                dataRand[j] = specialValues[specialValueIndex];
            } else {
                // Generate random data in [-dataMax*1.1, dataMax*1.1]
                dataRand[j] = (LaneType) rng.uniform(-dataMax * 1.1, dataMax * 1.1);
            }
        }
        // Compare with std::exp
        R x = dataRand;
        resRand = v_exp(x);
        for (int j = 0; j < n; ++j) {
            // Explicit cast keeps the "%f" vararg a double for every LaneType.
            SCOPED_TRACE(cv::format("Random test value: %f", (double) dataRand[j]));
            LaneType std_exp = std::exp(dataRand[j]);
            if (dataRand[j] == 0) {
                // input 0 -> output 1
                EXPECT_EQ(resRand[j], 1);
            } else if (dataRand[j] == 1) {
                // input 1 -> output e
                EXPECT_NEAR((LaneType) M_E, resRand[j], 1e-15);
            } else if (dataRand[j] > 0 && std::isinf(dataRand[j])) {
                // input INF -> output INF
                EXPECT_TRUE(resRand[j] > 0 && std::isinf(resRand[j]));
            } else if (dataRand[j] < 0 && std::isinf(dataRand[j])) {
                // input -INF -> output 0
                EXPECT_EQ(resRand[j], 0);
            } else if (std::isnan(dataRand[j])) {
                // input NaN -> output NaN
                EXPECT_TRUE(std::isnan(resRand[j]));
            } else if (dataRand[j] == dataMax) {
                // input dataMax -> output less than INFINITY
                EXPECT_LT(resRand[j], (LaneType) INFINITY);
            } else if (std::isinf(resRand[j])) {
                // output INF -> input close to edge
                EXPECT_GT(dataRand[j], dataMax);
            } else {
                // Relative error bound plus a small absolute term so that
                // results near zero are not held to an impossible tolerance.
                EXPECT_GE(resRand[j], 0);
                EXPECT_LT(std::abs(resRand[j] - std_exp), diff_thr * (std_exp + flt_min * enlarge_factor));
            }
        }
    }
}
// Runs the v_exp test driver for half-precision lanes; a no-op when
// CV_SIMD_FP16 is not enabled. Returns *this so calls can be chained.
TheTest &test_exp_fp16() {
#if CV_SIMD_FP16
    // 0x0400 is the binary16 bit pattern with exponent field 1 and zero mantissa,
    // i.e. the smallest positive normal half-precision value.
    const uint16_t minNormalBits = 0x0400;
    float16_t minNormal;
    std::memcpy(&minNormal, &minNormalBits, sizeof(minNormal));

    const float16_t maxInput = (float16_t) 10;
    const float16_t relTolerance = (float16_t) 1e-2;
    const float16_t absScale = (float16_t) 1e2;
    __test_exp(maxInput, relTolerance, absScale, minNormal);
#endif
    return *this;
}
// Runs the v_exp test driver for single-precision lanes.
// Returns *this so calls can be chained.
TheTest &test_exp_fp32() {
    const float maxInput = 88.0f;      // must still give a finite result
    const float relTolerance = 1e-6f;  // relative error allowed against std::exp
    const float absScale = 1e6f;       // multiplier of FLT_MIN in the absolute tolerance term
    __test_exp(maxInput, relTolerance, absScale, FLT_MIN);
    return *this;
}
// Runs the v_exp test driver for double-precision lanes; a no-op when neither
// CV_SIMD_64F nor CV_SIMD_SCALABLE_64F is enabled. Returns *this for chaining.
TheTest &test_exp_fp64() {
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    const double maxInput = 709.0;      // must still give a finite result
    const double relTolerance = 1e-15;  // relative error allowed against std::exp
    const double absScale = 1e15;       // multiplier of DBL_MIN in the absolute tolerance term
    __test_exp(maxInput, relTolerance, absScale, DBL_MIN);
#endif
    return *this;
}
}; };
#define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION); #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION);
@ -2011,6 +2108,7 @@ void test_hal_intrin_float32()
.test_extract_highest() .test_extract_highest()
.test_broadcast_highest() .test_broadcast_highest()
.test_pack_triplets() .test_pack_triplets()
.test_exp_fp32()
#if CV_SIMD_WIDTH == 32 #if CV_SIMD_WIDTH == 32
.test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>() .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
.test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>() .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
@ -2035,13 +2133,13 @@ void test_hal_intrin_float64()
.test_mask() .test_mask()
.test_unpack() .test_unpack()
.test_float_math() .test_float_math()
.test_round_pair_f64()
.test_float_cvt32() .test_float_cvt32()
.test_reverse() .test_reverse()
.test_extract<0>().test_extract<1>() .test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>() .test_rotate<0>().test_rotate<1>()
.test_extract_n<0>().test_extract_n<1>() .test_extract_n<0>().test_extract_n<1>()
.test_extract_highest() .test_extract_highest()
.test_exp_fp64()
//.test_broadcast_element<0>().test_broadcast_element<1>() //.test_broadcast_element<0>().test_broadcast_element<1>()
#if CV_SIMD_WIDTH == 32 #if CV_SIMD_WIDTH == 32
.test_extract<2>().test_extract<3>() .test_extract<2>().test_extract<3>()
@ -2062,6 +2160,7 @@ void test_hal_intrin_float16()
#if CV_SIMD_FP16 #if CV_SIMD_FP16
.test_loadstore_fp16() .test_loadstore_fp16()
.test_float_cvt_fp16() .test_float_cvt_fp16()
.test_exp_fp16()
#endif #endif
; ;
#else #else

@ -71,48 +71,12 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
// calculate the exp value along the axis // calculate the exp value along the axis
v_float32 vs = vx_setzero_f32(); v_float32 vs = vx_setzero_f32();
vmax = vx_setall_f32(maxVal); vmax = vx_setall_f32(maxVal);
// initialize vexp constant v_float32 val;
v_float32 _vexp_lo = vx_setall_f32(-88.3762626647949f);
v_float32 _vexp_hi = vx_setall_f32(88.3762626647949f);
v_float32 _vexp_half = vx_setall_f32(0.5f);
v_float32 _vexp_one = vx_setall_f32(1.f);
v_float32 _vexp_LOG2EF = vx_setall_f32(1.44269504088896341f);
v_float32 _vexp_C1 = vx_setall_f32(-0.693359375f);
v_float32 _vexp_C2 = vx_setall_f32(2.12194440e-4f);
v_float32 _vexp_p0 = vx_setall_f32(1.9875691500E-4f);
v_float32 _vexp_p1 = vx_setall_f32(1.3981999507E-3f);
v_float32 _vexp_p2 = vx_setall_f32(8.3334519073E-3f);
v_float32 _vexp_p3 = vx_setall_f32(4.1665795894E-2f);
v_float32 _vexp_p4 = vx_setall_f32(1.6666665459E-1f);
v_float32 _vexp_p5 = vx_setall_f32(5.0000001201E-1f);
// initialize temp vectors for vexp
v_float32 val, _vexp_, _vexp_x, _vexp_y, _vexp_z;
v_int32 _vexp_mm;
// calculate and sum all data along axis // calculate and sum all data along axis
for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) { for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) {
val = vx_load(axisBuf + cnDim); val = vx_load(axisBuf + cnDim);
val = v_sub(val, vmax); val = v_sub(val, vmax);
val = v_exp(val);
// compute vexp of val
_vexp_x = v_min(val, _vexp_hi);
_vexp_x = v_max(_vexp_x, _vexp_lo);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF, _vexp_half);
_vexp_mm = v_floor(_vexp_);
_vexp_ = v_cvt_f32(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, vx_setall_s32(0x7f));
_vexp_mm = v_shl(_vexp_mm, 23);
_vexp_x = v_fma(_vexp_, _vexp_C1, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2, _vexp_x);
_vexp_z = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_x, _vexp_p0, _vexp_p1);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5);
_vexp_y = v_fma(_vexp_y, _vexp_z, _vexp_x);
_vexp_y = v_add(_vexp_y, _vexp_one);
val = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
vs = v_add(vs, val); vs = v_add(vs, val);
v_store(axisBuf + cnDim, val); v_store(axisBuf + cnDim, val);

Loading…
Cancel
Save