Merge pull request #25892 from WanliZhong:v_sincos

Add support for v_sin and v_cos (Sine and Cosine) #25892

This PR aims to implement `v_sincos(v_float16 x)`, `v_sincos(v_float32 x)` and `v_sincos(v_float64 x)`. 
Merged after https://github.com/opencv/opencv/pull/25891 and https://github.com/opencv/opencv/pull/26023

**NOTE:** 
Also, this patch changes the previously added `v_exp`, `v_log` and `v_erf` to take their parameters by reference instead of by value, to match the API of the other universal intrinsics.

TODO:
- [x] double and half float precision
- [x] tests for them
- [x] doc to explain the implementation

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
pull/21407/merge
Wanli 1 month ago committed by GitHub
parent 69803e7b99
commit 687e37e6a8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 18
      modules/core/include/opencv2/core/hal/intrin_avx.hpp
  2. 18
      modules/core/include/opencv2/core/hal/intrin_avx512.hpp
  3. 38
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  4. 18
      modules/core/include/opencv2/core/hal/intrin_lasx.hpp
  5. 18
      modules/core/include/opencv2/core/hal/intrin_lsx.hpp
  6. 242
      modules/core/include/opencv2/core/hal/intrin_math.hpp
  7. 18
      modules/core/include/opencv2/core/hal/intrin_msa.hpp
  8. 23
      modules/core/include/opencv2/core/hal/intrin_neon.hpp
  9. 18
      modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
  10. 18
      modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
  11. 17
      modules/core/include/opencv2/core/hal/intrin_sse.hpp
  12. 18
      modules/core/include/opencv2/core/hal/intrin_vsx.hpp
  13. 18
      modules/core/include/opencv2/core/hal/intrin_wasm.hpp
  14. 96
      modules/core/test/test_intrin_utils.hpp

@ -3167,12 +3167,18 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
inline void v256_cleanup() { _mm256_zeroall(); }
#include "intrin_math.hpp"
// Elementwise math functions for v_float32x8 / v_float64x4.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -3079,12 +3079,18 @@ inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signm
inline void v512_cleanup() { _mm256_zeroall(); }
#include "intrin_math.hpp"
// Elementwise math functions for v_float32x16 / v_float64x8.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32x16 v_exp(const v_float32x16& x) { return v_exp_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_log(const v_float32x16& x) { return v_log_default_32f<v_float32x16, v_int32x16>(x); }
inline void v_sincos(const v_float32x16& x, v_float32x16& s, v_float32x16& c) { v_sincos_default_32f<v_float32x16, v_int32x16>(x, s, c); }
inline v_float32x16 v_sin(const v_float32x16& x) { return v_sin_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_cos(const v_float32x16& x) { return v_cos_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_erf(const v_float32x16& x) { return v_erf_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float64x8 v_exp(const v_float64x8& x) { return v_exp_default_64f<v_float64x8, v_int64x8>(x); }
inline v_float64x8 v_log(const v_float64x8& x) { return v_log_default_64f<v_float64x8, v_int64x8>(x); }
inline void v_sincos(const v_float64x8& x, v_float64x8& s, v_float64x8& c) { v_sincos_default_64f<v_float64x8, v_int64x8>(x, s, c); }
inline v_float64x8 v_sin(const v_float64x8& x) { return v_sin_default_64f<v_float64x8, v_int64x8>(x); }
inline v_float64x8 v_cos(const v_float64x8& x) { return v_cos_default_64f<v_float64x8, v_int64x8>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -264,7 +264,7 @@ Most of these operations return only one value.
### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, @ref v_log,
@ref v_erf, @ref v_sin, @ref v_cos
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
### Conversions
@ -366,6 +366,7 @@ Floating point:
|broadcast_element | x | |
|exp | x | x |
|log | x | x |
|sin, cos | x | x |
@{ */
@ -745,10 +746,41 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
//! @cond IGNORED
/**
 * @brief Compute sine \f$ sin(x) \f$ and cosine \f$ cos(x) \f$ of elements at the same time
 *
 * Only for floating point types. Core implementation steps of the SIMD backends:
 * 1. Input Normalization: multiply \f$ |x| \f$ by \f$ \frac{4}{\pi} \f$, round the truncated quotient up to an even
 *    integer and subtract that multiple of \f$ \frac{\pi}{4} \f$, which reduces the angle to the range
 *    \f$ [-\frac{\pi}{4}, \frac{\pi}{4}] \f$.
 * 2. Polynomial Approximation for \f$ sin(x) \f$ and \f$ cos(x) \f$:
 *    - For float16 and float32, use a polynomial with 4 terms for sine and 5 terms for cosine.
 *    - For float64, use a polynomial with 7 terms for sine and 8 terms for cosine.
 * 3. Select Results: select and convert the final sine and cosine values for the original input angle.
 *
 * This generic fallback evaluates std::sin and std::cos lane by lane.
 *
 * @param x input angles
 * @param s receives the sine of every lane of x
 * @param c receives the cosine of every lane of x
 *
 * @note The precision of the calculation depends on the implementation and the data type of the input vector.
 * @note Each input lane is read once before either output lane is written, so `s` or `c` may be the same
 *       register as `x`.
 */
template<typename _Tp, int n>
inline void v_sincos(const v_reg<_Tp, n>& x, v_reg<_Tp, n>& s, v_reg<_Tp, n>& c)
{
    for( int i = 0; i < n; i++ )
    {
        // Take a copy of the lane first: if `s` refers to the same register as `x`,
        // writing s.s[i] would otherwise change the value the cosine is computed from.
        const auto angle = x.s[i];
        s.s[i] = std::sin(angle);
        c.s[i] = std::cos(angle);
    }
}
/**
* @brief Sine \f$ sin(x) \f$ of elements
*
* Only for floating point types. The SIMD backends obtain the result from the same routine as
* @ref v_sincos and return only its sine output, so prefer @ref v_sincos when both values are needed.
* This generic version is instantiated from std::sin.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
/**
* @brief Cosine \f$ cos(x) \f$ of elements
*
* Only for floating point types. The SIMD backends obtain the result from the same routine as
* @ref v_sincos and return only its cosine output, so prefer @ref v_sincos when both values are needed.
* This generic version is instantiated from std::cos.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
//! @endcond
/** @brief Absolute value of elements

@ -3014,12 +3014,18 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
inline void v256_cleanup() {}
#include "intrin_math.hpp"
// Elementwise math functions for v_float32x8 / v_float64x4.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -2524,12 +2524,18 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
inline void v_cleanup() {}
#include "intrin_math.hpp"
// Elementwise math functions for v_float32x4 / v_float64x2.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -405,6 +405,248 @@ inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {
}
//! @}
//! @name Sine and Cosine
//! @{
/**
 * Half-precision sine and cosine of every lane of x, computed together.
 *
 * Steps, as implemented below:
 *  1. Work on |x| and remember which lanes were negative.
 *  2. emm2 = (trunc(|x| * 4/pi) + 1) & ~1, i.e. the truncated quotient bumped up to an even integer.
 *  3. Subtract emm2 * pi/4 from |x| (pi/4 is split into three constants applied by three fma steps).
 *  4. Evaluate one polynomial from the v_coscof_* coefficients (y1) and one from the v_sincof_*
 *     coefficients (y2) on the reduced argument.
 *  5. Bits 1 and 2 of emm2 decide which polynomial feeds each output and whether its sign is flipped.
 *  6. NaN and infinite inputs are meant to yield NaN in both outputs.
 *
 * The comments assume v_select(mask, a, b) yields a in lanes where mask is set and b elsewhere.
 *
 * @param x    input angles (presumably radians, given the 4/pi scaling)
 * @param ysin receives the sine of each lane
 * @param ycos receives the cosine of each lane
 */
template<typename _TpVec16F, typename _TpVec16S>
inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {
const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
// The three DP literals add up to about -0.78539816 (-pi/4); they are applied one after another below.
const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));
const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
// Coefficients of the sine polynomial (highest order first).
const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));
const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
// Coefficients of the cosine polynomial (highest order first).
const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));
const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
// Bit pattern 0x7e00 reinterpreted as float16: the NaN written for invalid inputs.
const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00));
// -0 has only the sign bit set, so XOR-ing with it flips the sign of a value.
const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f));
_TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec16S emm2;
// Lanes where the input is negative.
sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
// emm2 = (trunc(|x| * 4/pi) + 1) & ~1: always an even integer.
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));
_vy = v_cvt_f16(emm2);
// Set in lanes where (emm2 & 2) == 0; used at the end to pick between y1 and y2.
_TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));
// Range reduction: _vx = |x| + emm2 * (DP1 + DP2 + DP3).
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
// Combine "input was negative" with "(emm2 & 4) == 0"; lanes left set keep the sine as computed.
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));
// Set in lanes where ((emm2 - 2) & 4) == 0; those lanes get a negated cosine.
sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));
_TpVec16F _vxx = v_mul(_vx, _vx);
_TpVec16F y1, y2;
// y1 = (((coscof_p0*xx + coscof_p1)*xx + coscof_p2)*xx - 0.5)*xx + 1, with xx = _vx * _vx.
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));
// y2 = ((sincof_p0*xx + sincof_p1)*xx + sincof_p2) * xx * _vx + _vx.
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
// Route the two polynomials to the outputs, swapped in lanes where poly_mask is clear.
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
// Apply the signs worked out above.
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
// NOTE(review): _vx is the range-reduced value at this point, not |x|; the infinity test
// appears to rely on an infinite input staying infinite through the fma steps - confirm.
_TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
// v_ne(x, x) flags NaN lanes (NaN compares unequal to itself).
_TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
/**
 * Half-precision sine of every lane of x.
 * Runs the combined v_sincos_default_16f kernel and hands back only its sine output.
 */
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {
    _TpVec16F sine;
    _TpVec16F cosine;  // filled in by the kernel, not returned
    v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, sine, cosine);
    return sine;
}
/**
 * Half-precision cosine of every lane of x.
 * Runs the combined v_sincos_default_16f kernel and hands back only its cosine output.
 */
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {
    _TpVec16F sine;  // filled in by the kernel, not returned
    _TpVec16F cosine;
    v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, sine, cosine);
    return cosine;
}
/**
 * Single-precision sine and cosine of every lane of x, computed together.
 *
 * Steps, as implemented below:
 *  1. Work on |x| and remember which lanes were negative.
 *  2. emm2 = (trunc(|x| * 4/pi) + 1) & ~1, i.e. the truncated quotient bumped up to an even integer.
 *  3. Subtract emm2 * pi/4 from |x| (pi/4 is split into three constants applied by three fma steps).
 *  4. Evaluate one polynomial from the v_coscof_* coefficients (y1) and one from the v_sincof_*
 *     coefficients (y2) on the reduced argument.
 *  5. Bits 1 and 2 of emm2 decide which polynomial feeds each output and whether its sign is flipped.
 *  6. NaN and infinite inputs are meant to yield NaN in both outputs.
 *
 * The comments assume v_select(mask, a, b) yields a in lanes where mask is set and b elsewhere.
 *
 * @param x    input angles (presumably radians, given the 4/pi scaling)
 * @param ysin receives the sine of each lane
 * @param ycos receives the cosine of each lane
 */
template<typename _TpVec32F, typename _TpVec32S>
inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {
const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
// The three DP literals add up to about -0.78539816 (-pi/4); they are applied one after another below.
const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);
const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
// Coefficients of the sine polynomial (highest order first).
const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);
const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
// Coefficients of the cosine polynomial (highest order first).
const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);
const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
// Bit pattern 0x7fc00000 reinterpreted as float32: the NaN written for invalid inputs.
const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000));
// -0 has only the sign bit set, so XOR-ing with it flips the sign of a value.
const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f);
_TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec32S emm2;
// Lanes where the input is negative.
sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
// emm2 = (trunc(|x| * 4/pi) + 1) & ~1: always an even integer.
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));
_vy = v_cvt_f32(emm2);
// Set in lanes where (emm2 & 2) == 0; used at the end to pick between y1 and y2.
_TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));
// Range reduction: _vx = |x| + emm2 * (DP1 + DP2 + DP3).
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
// Combine "input was negative" with "(emm2 & 4) == 0"; lanes left set keep the sine as computed.
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));
// Set in lanes where ((emm2 - 2) & 4) == 0; those lanes get a negated cosine.
sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));
_TpVec32F _vxx = v_mul(_vx, _vx);
_TpVec32F y1, y2;
// y1 = (((coscof_p0*xx + coscof_p1)*xx + coscof_p2)*xx - 0.5)*xx + 1, with xx = _vx * _vx.
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));
// y2 = ((sincof_p0*xx + sincof_p1)*xx + sincof_p2) * xx * _vx + _vx.
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
// Route the two polynomials to the outputs, swapped in lanes where poly_mask is clear.
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
// Apply the signs worked out above.
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
// NOTE(review): _vx is the range-reduced value at this point, not |x|; the infinity test
// appears to rely on an infinite input staying infinite through the fma steps - confirm.
_TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
// v_ne(x, x) flags NaN lanes (NaN compares unequal to itself).
_TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
/**
 * Single-precision sine of every lane of x.
 * Runs the combined v_sincos_default_32f kernel and hands back only its sine output.
 */
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {
    _TpVec32F sine;
    _TpVec32F cosine;  // filled in by the kernel, not returned
    v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, sine, cosine);
    return sine;
}
/**
 * Single-precision cosine of every lane of x.
 * Runs the combined v_sincos_default_32f kernel and hands back only its cosine output.
 */
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {
    _TpVec32F sine;  // filled in by the kernel, not returned
    _TpVec32F cosine;
    v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, sine, cosine);
    return cosine;
}
/**
 * Double-precision sine and cosine of every lane of x, computed together.
 *
 * Steps, as implemented below:
 *  1. Work on |x| and remember which lanes were negative.
 *  2. emm2 = (trunc(|x| * 4/pi) + 1) & ~1, i.e. the truncated quotient bumped up to an even integer.
 *  3. Subtract emm2 * pi/4 from |x| (pi/4 is split into three constants applied by three fma steps).
 *  4. Evaluate one polynomial from the v_cos_C* coefficients (y1) and one from the v_sin_C*
 *     coefficients (y2) on the reduced argument.
 *  5. Bits 1 and 2 of emm2 decide which polynomial feeds each output and whether its sign is flipped.
 *  6. NaN and infinite inputs are meant to yield NaN in both outputs.
 *
 * The comments assume v_select(mask, a, b) yields a in lanes where mask is set and b elsewhere.
 *
 * @param x    input angles (presumably radians, given the 4/pi scaling)
 * @param ysin receives the sine of each lane
 * @param ycos receives the cosine of each lane
 */
template<typename _TpVec64F, typename _TpVec64S>
inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {
const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
// The three DP literals add up to about -0.7853981633974483 (-pi/4); they are applied one after another below.
const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);
const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
// Coefficients of the sine polynomial (highest order first).
const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);
const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
// Coefficients of the cosine polynomial (highest order first).
const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);
const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
// Bit pattern 0x7ff8000000000000 reinterpreted as float64: the NaN written for invalid inputs.
const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000));
// -0 has only the sign bit set, so XOR-ing with it flips the sign of a value.
const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0);
_TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec64S emm2;
// Lanes where the input is negative.
sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
// emm2 = (trunc(|x| * 4/pi) + 1) & ~1: always an even integer.
// NOTE(review): v_trunc's result is widened with v_expand_low to fit _TpVec64S; presumably
// v_trunc yields narrower integer lanes for a float64 input, which would bound the usable
// magnitude of x - confirm.
emm2 = v_expand_low(v_trunc(_vy));
emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));
_vy = v_cvt_f64(emm2);
// Set in lanes where (emm2 & 2) == 0; used at the end to pick between y1 and y2.
_TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));
// Range reduction: _vx = |x| + emm2 * (DP1 + DP2 + DP3).
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
// Combine "input was negative" with "(emm2 & 4) == 0"; lanes left set keep the sine as computed.
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));
// Set in lanes where ((emm2 - 2) & 4) == 0; those lanes get a negated cosine.
sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));
_TpVec64F _vxx = v_mul(_vx, _vx);
_TpVec64F y1, y2;
// y1: Horner evaluation in xx = _vx * _vx of cos_C1..cos_C6, followed by the -0.5 and 1.0 terms.
y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);
y1 = v_fma(y1, _vxx, v_cos_C3);
y1 = v_fma(y1, _vxx, v_cos_C4);
y1 = v_fma(y1, _vxx, v_cos_C5);
y1 = v_fma(y1, _vxx, v_cos_C6);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));
// y2: Horner evaluation of sin_C1..sin_C6 in xx, then y2 = poly * xx * _vx + _vx.
y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);
y2 = v_fma(y2, _vxx, v_sin_C3);
y2 = v_fma(y2, _vxx, v_sin_C4);
y2 = v_fma(y2, _vxx, v_sin_C5);
y2 = v_fma(y2, _vxx, v_sin_C6);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
// Route the two polynomials to the outputs, swapped in lanes where poly_mask is clear.
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
// Apply the signs worked out above.
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
// NOTE(review): _vx is the range-reduced value at this point, not |x|; the infinity test
// appears to rely on an infinite input staying infinite through the fma steps - confirm.
_TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
// v_ne(x, x) flags NaN lanes (NaN compares unequal to itself).
_TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
/**
 * Double-precision sine of every lane of x.
 * Runs the combined v_sincos_default_64f kernel and hands back only its sine output.
 */
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {
    _TpVec64F sine;
    _TpVec64F cosine;  // filled in by the kernel, not returned
    v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, sine, cosine);
    return sine;
}
/**
 * Double-precision cosine of every lane of x.
 * Runs the combined v_sincos_default_64f kernel and hands back only its cosine output.
 */
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {
    _TpVec64F sine;  // filled in by the kernel, not returned
    _TpVec64F cosine;
    v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, sine, cosine);
    return cosine;
}
//! @}
/* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch
https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
*/

@ -1864,12 +1864,18 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
// Elementwise math functions for v_float32x4 / v_float64x2.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -2650,15 +2650,24 @@ inline void v_cleanup() {}
#include "intrin_math.hpp"
// Elementwise math functions for the 128-bit vector types.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.

// Half precision: only when CV_SIMD_FP16 is defined and non-zero.
#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
inline v_float16x8 v_exp(const v_float16x8& x) { return v_exp_default_16f<v_float16x8, v_int16x8>(x); }
inline v_float16x8 v_log(const v_float16x8& x) { return v_log_default_16f<v_float16x8, v_int16x8>(x); }
inline void v_sincos(const v_float16x8& x, v_float16x8& s, v_float16x8& c) { v_sincos_default_16f<v_float16x8, v_int16x8>(x, s, c); }
inline v_float16x8 v_sin(const v_float16x8& x) { return v_sin_default_16f<v_float16x8, v_int16x8>(x); }
inline v_float16x8 v_cos(const v_float16x8& x) { return v_cos_default_16f<v_float16x8, v_int16x8>(x); }
#endif
// Single precision.
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
// Double precision: only when CV_SIMD128_64F is non-zero.
#if CV_SIMD128_64F
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
#endif
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -2867,12 +2867,18 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
// Elementwise math functions for v_float32x4 / v_float64x2.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -2181,12 +2181,18 @@ inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0,
inline void v_cleanup() {}
#include "intrin_math.hpp"
// Elementwise math functions for v_float32 / v_float64.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32 v_exp(const v_float32& x) { return v_exp_default_32f<v_float32, v_int32>(x); }
inline v_float32 v_log(const v_float32& x) { return v_log_default_32f<v_float32, v_int32>(x); }
inline void v_sincos(const v_float32& x, v_float32& s, v_float32& c) { v_sincos_default_32f<v_float32, v_int32>(x, s, c); }
inline v_float32 v_sin(const v_float32& x) { return v_sin_default_32f<v_float32, v_int32>(x); }
inline v_float32 v_cos(const v_float32& x) { return v_cos_default_32f<v_float32, v_int32>(x); }
inline v_float32 v_erf(const v_float32& x) { return v_erf_default_32f<v_float32, v_int32>(x); }
inline v_float64 v_exp(const v_float64& x) { return v_exp_default_64f<v_float64, v_int64>(x); }
inline v_float64 v_log(const v_float64& x) { return v_log_default_64f<v_float64, v_int64>(x); }
inline void v_sincos(const v_float64& x, v_float64& s, v_float64& c) { v_sincos_default_64f<v_float64, v_int64>(x, s, c); }
inline v_float64 v_sin(const v_float64& x) { return v_sin_default_64f<v_float64, v_int64>(x); }
inline v_float64 v_cos(const v_float64& x) { return v_cos_default_64f<v_float64, v_int64>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -3460,12 +3460,19 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
// Elementwise math functions for v_float32x4 / v_float64x2.
// Every wrapper forwards to the matching v_*_default_* template from intrin_math.hpp.
// Inputs are taken by const reference only: a by-value overload of the same name next to
// the const-reference one would make a call such as v_exp(x) ambiguous.
// v_sincos writes the sine to `s` and the cosine to `c`.
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -1597,12 +1597,18 @@ inline Tvec v_broadcast_element(const Tvec& v)
{ return Tvec(vec_splat(v.val, i)); }
#include "intrin_math.hpp"
// Math wrappers for v_float32x4 / v_float64x2. Each one only forwards its argument(s) to the
// matching v_*_default_32f / v_*_default_64f template, instantiated with the float vector type
// and an integer vector type (v_int32x4 or v_int64x2).
//
// NOTE(review): the first five overloads take their argument by value and are otherwise
// identical to by-reference overloads further down. This looks like the removed and added sides
// of a diff shown together; if both sets were compiled as-is, a call such as v_exp(x) would be
// ambiguous. Confirm which set is kept.
inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
// Single-precision wrappers taking the input by const reference.
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
// v_sincos returns nothing; s and c are passed on as non-const references for the callee to fill.
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
// Double-precision wrappers taking the input by const reference.
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -2779,12 +2779,18 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
// No-op: the body is empty.
inline void v_cleanup() {}
#include "intrin_math.hpp"
// Math wrappers for v_float32x4 / v_float64x2. Each one only forwards its argument(s) to the
// matching v_*_default_32f / v_*_default_64f template, instantiated with the float vector type
// and an integer vector type (v_int32x4 or v_int64x2). The templates are presumably declared in
// intrin_math.hpp, included just above -- confirm.
//
// NOTE(review): the first five overloads take their argument by value and are otherwise
// identical to by-reference overloads further down. This looks like the removed and added sides
// of a diff shown together; if both sets were compiled as-is, a call such as v_exp(x) would be
// ambiguous. Confirm which set is kept.
inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
// Single-precision wrappers taking the input by const reference.
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
// v_sincos returns nothing; s and c are passed on as non-const references for the callee to fill.
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
// Double-precision wrappers taking the input by const reference.
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

@ -1912,6 +1912,99 @@ template<typename R> struct TheTest
return *this;
}
// Compares v_sincos against std::sin / std::cos lane by lane.
//   diff_thr - relative tolerance applied to the reference value.
//   flt_min  - scaled by 100 and added to |reference| so the bound stays non-zero
//              when the reference is (close to) zero.
// Two phases: a fixed-step sweep over [0, PI] and its mirror [-PI, 0], then
// randomly generated inputs mixed with special values (0, PI, PI/2, +/-INF, NaN).
void __test_sincos(LaneType diff_thr, LaneType flt_min) {
    const int lanes = VTraits<R>::vlanes();

    // Passes when |actual - expected| < diff_thr * (|expected| + 100 * flt_min).
    const auto expectClose = [&](LaneType actual, LaneType expected) {
        EXPECT_LT(std::abs(actual - expected), diff_thr * (std::abs(expected) + flt_min * 100));
    };

    // Phase 1: sweep. Every lane receives the next angle, so one vector covers
    // `lanes` consecutive steps; the negated angle goes into a second vector.
    const LaneType increment = (LaneType) 0.01;
    LaneType angle = 0;
    while (angle <= (LaneType)M_PI) {
        Data<R> posInput, negInput;
        for (int lane = 0; lane < lanes; ++lane) {
            posInput[lane] = angle;
            negInput[lane] = -angle;
            angle += increment;
        }

        R posVec = posInput, negVec = negInput;
        R posSinVec, posCosVec, negSinVec, negCosVec;
        v_sincos(posVec, posSinVec, posCosVec);
        v_sincos(negVec, negSinVec, negCosVec);

        Data<R> posSin = posSinVec, posCos = posCosVec, negSin = negSinVec, negCos = negCosVec;
        for (int lane = 0; lane < lanes; ++lane) {
            LaneType refPosSin = (LaneType) std::sin(posInput[lane]);
            LaneType refPosCos = (LaneType) std::cos(posInput[lane]);
            LaneType refNegSin = (LaneType) std::sin(negInput[lane]);
            LaneType refNegCos = (LaneType) std::cos(negInput[lane]);
            SCOPED_TRACE(cv::format("Period test value: %lf and %lf", (double) posInput[lane], (double) negInput[lane]));
            expectClose(posSin[lane], refPosSin);
            expectClose(posCos[lane], refPosCos);
            expectClose(negSin[lane], refNegSin);
            expectClose(negCos[lane], refNegCos);
        }
    }

    // Phase 2: random inputs, with roughly one lane in ten replaced by a special value.
    std::vector<LaneType> specials = {(LaneType) 0, (LaneType) M_PI, (LaneType) (M_PI / 2), (LaneType) INFINITY, (LaneType) -INFINITY, (LaneType) NAN};
    const int iterations = 10000;
    const double specialChance = 0.1;
    cv::RNG_MT19937 rng;
    for (int iter = 0; iter < iterations; iter++) {
        Data<R> input;
        for (int lane = 0; lane < lanes; ++lane) {
            const bool pickSpecial = rng.uniform(0.f, 1.f) <= specialChance;
            if (pickSpecial) {
                const int pick = rng.uniform(0, (int) specials.size());
                input[lane] = specials[pick];
            } else {
                // Both bounds are int literals, so the integer uniform() overload is the one
                // called here: the samples are whole numbers between -1000 and 1000.
                input[lane] = (LaneType) rng.uniform(-1000, 1000);
            }
        }

        R x = input, s, c;
        v_sincos(x, s, c);
        Data<R> gotSin = s, gotCos = c;

        for (int lane = 0; lane < lanes; ++lane) {
            SCOPED_TRACE(cv::format("Random test value: %lf", (double) input[lane]));
            LaneType refSin = (LaneType) std::sin(input[lane]);
            LaneType refCos = (LaneType) std::cos(input[lane]);

            // NaN or +/-INF input: both outputs are expected to be NaN.
            if (std::isnan(input[lane]) || std::isinf(input[lane])) {
                EXPECT_TRUE(std::isnan(gotSin[lane]));
                EXPECT_TRUE(std::isnan(gotCos[lane]));
                continue;
            }
            // Zero input: exact results are required (sin -> 0, cos -> 1).
            if (input[lane] == 0) {
                EXPECT_EQ(gotSin[lane], 0);
                EXPECT_EQ(gotCos[lane], 1);
                continue;
            }
            expectClose(gotSin[lane], refSin);
            expectClose(gotCos[lane], refCos);
        }
    }
}
// Half-precision entry point for the sin/cos checks. Does nothing unless CV_SIMD_FP16 is
// enabled; always returns *this so calls can be chained.
TheTest &test_sincos_fp16() {
#if CV_SIMD_FP16
hfloat flt16_min;
// 0x0400 is copied bit-for-bit into the hfloat. In IEEE 754 binary16 that pattern is the
// smallest positive normal value (2^-14) -- assumes hfloat uses that layout; confirm.
uint16_t flt16_min_hex = 0x0400;
std::memcpy(&flt16_min, &flt16_min_hex, sizeof(hfloat));
// Threshold 1e-3, with the value built above as the floor term.
__test_sincos((hfloat) 1e-3, flt16_min);
#endif
return *this;
}
// Single-precision entry point for the sin/cos checks: threshold 1e-6f with FLT_MIN as the
// floor term. Returns *this so calls can be chained.
TheTest &test_sincos_fp32() {
__test_sincos(1e-6f, FLT_MIN);
return *this;
}
// Double-precision entry point for the sin/cos checks: threshold 1e-11 with DBL_MIN as the
// floor term. Does nothing unless CV_SIMD_64F or CV_SIMD_SCALABLE_64F is enabled; always
// returns *this so calls can be chained.
TheTest &test_sincos_fp64() {
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
__test_sincos(1e-11, DBL_MIN);
#endif
return *this;
}
};
// Prints one line of the form "SIMD<N>: <function>", where N is 8 * VTraits<v_uint8>::vlanes()
// and <function> is CV__TRACE_FUNCTION. The `type` argument is not used by the expansion, and
// the expansion already ends with a ';'.
#define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION);
@ -2227,6 +2320,7 @@ void test_hal_intrin_float32()
.test_pack_triplets()
.test_exp_fp32()
.test_log_fp32()
.test_sincos_fp32()
.test_erf_fp32()
#if CV_SIMD_WIDTH == 32
.test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
@ -2261,6 +2355,7 @@ void test_hal_intrin_float64()
.test_extract_highest()
.test_exp_fp64()
.test_log_fp64()
.test_sincos_fp64()
//.test_broadcast_element<0>().test_broadcast_element<1>()
#if CV_SIMD_WIDTH == 32
.test_extract<2>().test_extract<3>()
@ -2283,6 +2378,7 @@ void test_hal_intrin_float16()
.test_float_cvt_fp16()
.test_exp_fp16()
.test_log_fp16()
.test_sincos_fp16()
#endif
;
#else

Loading…
Cancel
Save