@@ -3121,18 +3121,39 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
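
// Usage sketch (illustrative, not part of this patch): the macro
// instantiations above generate v_load_deinterleave / v_store_interleave
// overloads for the 4-lane 64-bit types, e.g.:
//
//     double buf[8];                    // x0 y0 x1 y1 x2 y2 x3 y3
//     v_float64x4 x, y;
//     v_load_deinterleave(buf, x, y);   // x = {x0,x1,x2,x3}, y = {y0,y1,y2,y3}
//     v_store_interleave(buf, x, y);    // pack the two planes back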

//
// FP16
//

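// Note (assumption, not stated in this patch): CV_FP16 is expected to be
// defined by OpenCV's configuration headers when the compiler exposes the
// F16C conversion intrinsics; otherwise the scalar fallbacks below compile.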
inline v_float32x8 v256_load_expand(const float16_t* ptr)
{
#if CV_FP16
    // F16C path: load 8 packed FP16 values and widen them to 8 FP32 lanes
    // with a single VCVTPH2PS.
    return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
#else
    // Scalar fallback: convert element by element through float16_t,
    // then load the aligned result as a vector.
    float CV_DECL_ALIGNED(32) buf[8];
    for (int i = 0; i < 8; i++)
        buf[i] = (float)ptr[i];
    return v256_load_aligned(buf);
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
{
#if CV_FP16
    // F16C path: narrow 8 FP32 lanes to 8 FP16 values (VCVTPS2PH);
    // imm8 = 0 selects round-to-nearest-even.
    __m128i ah = _mm256_cvtps_ph(a.val, 0);
    _mm_storeu_si128((__m128i*)ptr, ah);
#else
    // Scalar fallback: spill the vector to an aligned buffer and
    // convert element by element.
    float CV_DECL_ALIGNED(32) buf[8];
    v_store_aligned(buf, a);
    for (int i = 0; i < 8; i++)
        ptr[i] = float16_t(buf[i]);
#endif
}
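
// Usage sketch (illustrative, not part of this patch): round-trip 8
// half-precision values through FP32 arithmetic using the two helpers
// above; the F16C and fallback paths agree up to FP16 rounding:
//
//     float16_t src[8], dst[8];
//     v_float32x8 v = v256_load_expand(src); // widen FP16 -> FP32
//     v = v + v;                             // any FP32 vector math
//     v_pack_store(dst, v);                  // narrow FP32 -> FP16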

//
// end of FP16
//

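// _mm256_zeroall() (VZEROALL) clears all YMM registers so that legacy SSE
// code executed afterwards avoids AVX-SSE transition penalties.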
inline void v256_cleanup() { _mm256_zeroall(); }

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END