|
|
|
@ -4537,16 +4537,6 @@ static short convertFp16SW(float fp32) |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if CV_FP16 && (defined __GNUC__) && (defined __arm__ || defined __aarch64__) |
|
|
|
|
#if 5 <= __GNUC__ |
|
|
|
|
static inline float16x4_t load_f16(const short* p) { return vld1_f16((const float16_t*)p); } |
|
|
|
|
static inline void store_f16(short* p, float16x4_t v) { vst1_f16((float16_t*)p, v); } |
|
|
|
|
#else |
|
|
|
|
static inline float16x4_t load_f16(const short* p) { return (float16x4_t)vld1_s16(p); } |
|
|
|
|
static inline void store_f16(short* p, float16x4_t v) { vst1_s16(p, (int16x4_t)v); } |
|
|
|
|
#endif |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
// template for FP16 HW conversion function
|
|
|
|
|
template<typename T, typename DT> static void |
|
|
|
|
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size); |
|
|
|
@ -4570,21 +4560,11 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t |
|
|
|
|
#if CV_FP16 |
|
|
|
|
for ( ; x <= size.width - 4; x += 4) |
|
|
|
|
{ |
|
|
|
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) |
|
|
|
|
__m128 v_src = _mm_loadu_ps(src + x); |
|
|
|
|
v_float32x4 v_src = v_load(src + x); |
|
|
|
|
|
|
|
|
|
__m128i v_dst = _mm_cvtps_ph(v_src, 0); |
|
|
|
|
v_float16x4 v_dst = v_cvt_f16(v_src); |
|
|
|
|
|
|
|
|
|
_mm_storel_epi64((__m128i *)(dst + x), v_dst); |
|
|
|
|
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) |
|
|
|
|
float32x4_t v_src = vld1q_f32(src + x); |
|
|
|
|
|
|
|
|
|
float16x4_t v_dst = vcvt_f16_f32(v_src); |
|
|
|
|
|
|
|
|
|
store_f16(dst + x, v_dst); |
|
|
|
|
#else |
|
|
|
|
#error "Configuration error" |
|
|
|
|
#endif |
|
|
|
|
v_store_f16(dst + x, v_dst); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
@ -4626,21 +4606,11 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t |
|
|
|
|
#if CV_FP16 |
|
|
|
|
for ( ; x <= size.width - 4; x += 4) |
|
|
|
|
{ |
|
|
|
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) |
|
|
|
|
__m128i v_src = _mm_loadl_epi64((__m128i*)(src+x)); |
|
|
|
|
|
|
|
|
|
__m128 v_dst = _mm_cvtph_ps(v_src); |
|
|
|
|
v_float16x4 v_src = v_load_f16(src + x); |
|
|
|
|
|
|
|
|
|
_mm_storeu_ps(dst + x, v_dst); |
|
|
|
|
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) |
|
|
|
|
float16x4_t v_src = load_f16(src+x); |
|
|
|
|
v_float32x4 v_dst = v_cvt_f32(v_src); |
|
|
|
|
|
|
|
|
|
float32x4_t v_dst = vcvt_f32_f16(v_src); |
|
|
|
|
|
|
|
|
|
vst1q_f32(dst + x, v_dst); |
|
|
|
|
#else |
|
|
|
|
#error "Configuration error" |
|
|
|
|
#endif |
|
|
|
|
v_store(dst + x, v_dst); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|