|
|
|
@ -51,6 +51,13 @@ namespace { |
|
|
|
|
|
|
|
|
|
#ifdef CAROTENE_NEON |
|
|
|
|
|
|
|
|
|
// Adds a sign-matched 0.5 to every lane of v, i.e. computes v + copysign(0.5f, v).
//
// This helper does not produce an integral value by itself: it only applies
// the bias. Feeding the result to a float->int conversion that truncates
// toward zero yields round-half-away-from-zero.
//
// v       four packed single-precision lanes
// returns v with +0.5 added to lanes whose sign bit is clear and -0.5 added
//         to lanes whose sign bit is set
inline float32x4_t vroundq(const float32x4_t& v)
{
    // Build the sign-bit mask from an unsigned literal; `1 << 31` would shift
    // a signed int into its sign bit, which is not portable C++.
    const int32x4_t signMask = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000u));
    // Bit pattern of +0.5f, kept as integers so it can be OR-ed with a sign bit.
    const int32x4_t half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    // Isolate each lane's sign bit and graft it onto 0.5f.
    const int32x4_t signBits = vandq_s32(signMask, vreinterpretq_s32_f32(v));
    const float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, signBits));

    return vaddq_f32(v, v_addition);
}
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
|
|
inline T divSaturateQ(const T &v1, const T &v2, const float scale) |
|
|
|
|
{ |
|
|
|
@ -62,10 +69,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale) |
|
|
|
|
} |
|
|
|
|
template <> |
|
|
|
|
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale) |
|
|
|
|
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); } |
|
|
|
|
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); } |
|
|
|
|
template <> |
|
|
|
|
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) |
|
|
|
|
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); } |
|
|
|
|
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); } |
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
|
|
inline T divSaturate(const T &v1, const T &v2, const float scale) |
|
|
|
|