|
|
|
@ -151,6 +151,10 @@ void div(const Size2D &size, |
|
|
|
|
typedef typename internal::VecTraits<T>::vec128 vec128; |
|
|
|
|
typedef typename internal::VecTraits<T>::vec64 vec64; |
|
|
|
|
|
|
|
|
|
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) |
|
|
|
|
static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only"); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (scale == 0.0f || |
|
|
|
|
(std::numeric_limits<T>::is_integer && |
|
|
|
|
(scale * std::numeric_limits<T>::max()) < 1.0f && |
|
|
|
@ -311,6 +315,10 @@ void recip(const Size2D &size, |
|
|
|
|
typedef typename internal::VecTraits<T>::vec128 vec128; |
|
|
|
|
typedef typename internal::VecTraits<T>::vec64 vec64; |
|
|
|
|
|
|
|
|
|
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) |
|
|
|
|
static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only"); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (scale == 0.0f || |
|
|
|
|
(std::numeric_limits<T>::is_integer && |
|
|
|
|
scale < 1.0f && |
|
|
|
@ -463,8 +471,6 @@ void div(const Size2D &size, |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
float32x4_t v_zero = vdupq_n_f32(0.0f); |
|
|
|
|
|
|
|
|
|
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; |
|
|
|
|
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; |
|
|
|
|
|
|
|
|
@ -485,9 +491,7 @@ void div(const Size2D &size, |
|
|
|
|
float32x4_t v_src0 = vld1q_f32(src0 + j); |
|
|
|
|
float32x4_t v_src1 = vld1q_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); |
|
|
|
|
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( |
|
|
|
|
vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask))); |
|
|
|
|
vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < roiw64; j += 2) |
|
|
|
@ -495,14 +499,12 @@ void div(const Size2D &size, |
|
|
|
|
float32x2_t v_src0 = vld1_f32(src0 + j); |
|
|
|
|
float32x2_t v_src1 = vld1_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); |
|
|
|
|
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( |
|
|
|
|
vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask))); |
|
|
|
|
vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < size.width; j++) |
|
|
|
|
{ |
|
|
|
|
dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f; |
|
|
|
|
dst[j] = src0[j] / src1[j]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -523,10 +525,8 @@ void div(const Size2D &size, |
|
|
|
|
float32x4_t v_src0 = vld1q_f32(src0 + j); |
|
|
|
|
float32x4_t v_src1 = vld1q_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); |
|
|
|
|
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( |
|
|
|
|
vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale), |
|
|
|
|
internal::vrecpq_f32(v_src1))), v_mask))); |
|
|
|
|
vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale), |
|
|
|
|
internal::vrecpq_f32(v_src1))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < roiw64; j += 2) |
|
|
|
@ -534,15 +534,13 @@ void div(const Size2D &size, |
|
|
|
|
float32x2_t v_src0 = vld1_f32(src0 + j); |
|
|
|
|
float32x2_t v_src1 = vld1_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); |
|
|
|
|
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( |
|
|
|
|
vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale), |
|
|
|
|
internal::vrecp_f32(v_src1))), v_mask))); |
|
|
|
|
vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale), |
|
|
|
|
internal::vrecp_f32(v_src1))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < size.width; j++) |
|
|
|
|
{ |
|
|
|
|
dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f; |
|
|
|
|
dst[j] = src0[j] * scale / src1[j]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -620,8 +618,6 @@ void reciprocal(const Size2D &size, |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
float32x4_t v_zero = vdupq_n_f32(0.0f); |
|
|
|
|
|
|
|
|
|
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; |
|
|
|
|
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; |
|
|
|
|
|
|
|
|
@ -639,23 +635,19 @@ void reciprocal(const Size2D &size, |
|
|
|
|
|
|
|
|
|
float32x4_t v_src1 = vld1q_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); |
|
|
|
|
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( |
|
|
|
|
vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask))); |
|
|
|
|
vst1q_f32(dst + j, internal::vrecpq_f32(v_src1)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < roiw64; j += 2) |
|
|
|
|
{ |
|
|
|
|
float32x2_t v_src1 = vld1_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); |
|
|
|
|
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( |
|
|
|
|
vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask))); |
|
|
|
|
vst1_f32(dst + j, internal::vrecp_f32(v_src1)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < size.width; j++) |
|
|
|
|
{ |
|
|
|
|
dst[j] = src1[j] ? 1.0f / src1[j] : 0; |
|
|
|
|
dst[j] = 1.0f / src1[j]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -673,25 +665,19 @@ void reciprocal(const Size2D &size, |
|
|
|
|
|
|
|
|
|
float32x4_t v_src1 = vld1q_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); |
|
|
|
|
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( |
|
|
|
|
vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1), |
|
|
|
|
scale)),v_mask))); |
|
|
|
|
vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < roiw64; j += 2) |
|
|
|
|
{ |
|
|
|
|
float32x2_t v_src1 = vld1_f32(src1 + j); |
|
|
|
|
|
|
|
|
|
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); |
|
|
|
|
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( |
|
|
|
|
vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1), |
|
|
|
|
scale)), v_mask))); |
|
|
|
|
vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (; j < size.width; j++) |
|
|
|
|
{ |
|
|
|
|
dst[j] = src1[j] ? scale / src1[j] : 0; |
|
|
|
|
dst[j] = scale / src1[j]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|