|
|
|
@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32) |
|
|
|
|
|
|
|
|
|
// template for FP16 HW conversion function
|
|
|
|
|
template<typename T, typename DT> static void |
|
|
|
|
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size) |
|
|
|
|
{ |
|
|
|
|
sstep /= sizeof(src[0]); |
|
|
|
|
dstep /= sizeof(dst[0]); |
|
|
|
|
|
|
|
|
|
for( ; size.height--; src += sstep, dst += dstep ) |
|
|
|
|
{ |
|
|
|
|
int x = 0; |
|
|
|
|
|
|
|
|
|
for ( ; x < size.width; x++ ) |
|
|
|
|
{ |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size); |
|
|
|
|
|
|
|
|
|
template<> void |
|
|
|
|
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size) |
|
|
|
@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t |
|
|
|
|
{ |
|
|
|
|
int x = 0; |
|
|
|
|
|
|
|
|
|
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 ) |
|
|
|
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) |
|
|
|
|
if ( ( (intptr_t)dst & 0xf ) == 0 ) |
|
|
|
|
#endif |
|
|
|
|
{ |
|
|
|
|
#if CV_FP16 |
|
|
|
|
for ( ; x <= size.width - 4; x += 4) |
|
|
|
|
{ |
|
|
|
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) |
|
|
|
|
__m128 v_src = _mm_load_ps(src + x); |
|
|
|
|
__m128 v_src = _mm_loadu_ps(src + x); |
|
|
|
|
|
|
|
|
|
__m128i v_dst = _mm_cvtps_ph(v_src, 0); |
|
|
|
|
|
|
|
|
|
_mm_storel_epi64((__m128i *)(dst + x), v_dst); |
|
|
|
|
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) |
|
|
|
|
float32x4_t v_src = *(float32x4_t*)(src + x); |
|
|
|
|
float32x4_t v_src = vld1q_f32(src + x); |
|
|
|
|
|
|
|
|
|
float16x4_t v_dst = vcvt_f16_f32(v_src); |
|
|
|
|
|
|
|
|
|
*(float16x4_t*)(dst + x) = v_dst; |
|
|
|
|
vst1_f16((float16_t*)(dst + x), v_dst); |
|
|
|
|
#else |
|
|
|
|
#error "Configuration error" |
|
|
|
|
#endif |
|
|
|
@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t |
|
|
|
|
{ |
|
|
|
|
int x = 0; |
|
|
|
|
|
|
|
|
|
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) ) |
|
|
|
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) |
|
|
|
|
if ( ( (intptr_t)src & 0xf ) == 0 ) |
|
|
|
|
#endif |
|
|
|
|
{ |
|
|
|
|
#if CV_FP16 |
|
|
|
|
for ( ; x <= size.width - 4; x += 4) |
|
|
|
@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t |
|
|
|
|
|
|
|
|
|
__m128 v_dst = _mm_cvtph_ps(v_src); |
|
|
|
|
|
|
|
|
|
_mm_store_ps((dst + x), v_dst); |
|
|
|
|
_mm_storeu_ps(dst + x, v_dst); |
|
|
|
|
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) |
|
|
|
|
float16x4_t v_src = *(float16x4_t*)(src + x); |
|
|
|
|
float16x4_t v_src = vld1_f16((float16_t*)(src + x)); |
|
|
|
|
|
|
|
|
|
float32x4_t v_dst = vcvt_f32_f16(v_src); |
|
|
|
|
|
|
|
|
|
*(float32x4_t*)(dst + x) = v_dst; |
|
|
|
|
vst1q_f32(dst + x, v_dst); |
|
|
|
|
#else |
|
|
|
|
#error "Configuration error" |
|
|
|
|
#endif |
|
|
|
@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s |
|
|
|
|
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
|
|
|
|
|
dtype* dst, size_t dstep, Size size, double*) \
|
|
|
|
|
{ \
|
|
|
|
|
cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \
|
|
|
|
|
cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ |
|
|
|
@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) |
|
|
|
|
ddepth = CV_32F; |
|
|
|
|
break; |
|
|
|
|
default: |
|
|
|
|
CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth"); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|