From e269ef96cb7bd77edad12a30963b1c97c08875e8 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Tue, 6 Jun 2017 22:26:51 +0900 Subject: [PATCH] update convertFp16 using CV_CPU_CALL_FP16 * avoid link error (move the implementation of software version to header) * make getConvertFuncFp16 local (move from precomp.hpp to convert.hpp) * fix error on 32bit x86 --- modules/core/src/convert.cpp | 263 +++--------------------------- modules/core/src/convert.fp16.cpp | 172 +++++++++++++++++++ modules/core/src/convert.hpp | 173 ++++++++++++++++++++ modules/core/src/precomp.hpp | 1 - 4 files changed, 371 insertions(+), 238 deletions(-) create mode 100644 modules/core/src/convert.fp16.cpp create mode 100644 modules/core/src/convert.hpp diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 8baef8a8d9..58aa0066f3 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -44,7 +44,7 @@ #include "precomp.hpp" #include "opencl_kernels_core.hpp" -#include "opencv2/core/hal/intrin.hpp" +#include "convert.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" @@ -4573,256 +4573,40 @@ struct Cvt_SIMD #endif -#if !CV_FP16_TYPE -// const numbers for floating points format -const unsigned int kShiftSignificand = 13; -const unsigned int kMaskFp16Significand = 0x3ff; -const unsigned int kBiasFp16Exponent = 15; -const unsigned int kBiasFp32Exponent = 127; -#endif - -#if CV_FP16_TYPE -static float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf a; - a.i = fp16; - return (float)a.h; -} -#else -static float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf b; - b.i = fp16; - int exponent = b.fmt.exponent - kBiasFp16Exponent; - int significand = b.fmt.significand; - - Cv32suf a; - a.i = 0; - a.fmt.sign = b.fmt.sign; // sign bit - if( exponent == 16 ) - { - // Inf or NaN - a.i = a.i | 0x7F800000; - if( significand != 0 ) - { - // NaN -#if defined(__x86_64__) || defined(_M_X64) - // 64bit - a.i = a.i | 0x7FC00000; -#endif - a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand); - } - return a.f; - } - else if ( exponent == -15 ) - { - // subnormal in Fp16 - if( significand == 0 ) - { - // zero - return a.f; - } - else - { - int shift = -1; - while( ( significand & 0x400 ) == 0 ) - { - significand = significand << 1; - shift++; - } - significand = significand & kMaskFp16Significand; - exponent -= shift; - } - } - - a.fmt.exponent = (exponent+kBiasFp32Exponent); - a.fmt.significand = significand << kShiftSignificand; - return a.f; -} -#endif - -#if CV_FP16_TYPE -static short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv16suf a; - a.h = (__fp16)fp32; - return a.i; -} -#else -static short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv32suf a; - a.f = fp32; - int exponent = a.fmt.exponent - kBiasFp32Exponent; - int significand = a.fmt.significand; - - Cv16suf result; - result.i = 0; - unsigned int absolute = a.i & 0x7fffffff; - if( 0x477ff000 <= absolute ) - { - // Inf in Fp16 - result.i = result.i | 0x7C00; - if( exponent == 128 && significand != 0 ) - { - // NaN - result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) ); - } - } - else if ( absolute < 0x33000001 ) - { - // too small for fp16 - result.i = 0; - } - else if ( absolute < 0x33c00000 ) - { - result.i = 1; - } - else if ( absolute < 0x34200001 ) - { - result.i = 2; - } - else if ( absolute < 0x387fe000 ) - { - // subnormal in Fp16 - int fp16Significand = significand | 0x800000; - int bitShift = (-exponent) - 1; - fp16Significand = fp16Significand >> bitShift; - - // special cases to round up - bitShift = exponent + 24; - int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) ); - if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) ) - { - fp16Significand++; - } - result.i = (short)fp16Significand; - } - else - { - // usual situation - // exponent - result.fmt.exponent = ( exponent + kBiasFp16Exponent ); - - // significand; - short fp16Significand = (short)(significand >> kShiftSignificand); - result.fmt.significand = fp16Significand; - - // special cases to round up - short lsb10bitsFp32 = (significand & 0x1fff); - short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 ); - if( threshold <= lsb10bitsFp32 ) - { - result.i++; - } - else if ( fp16Significand == 0x3ff && exponent == -15) - { - result.i++; - } - } - - // sign bit - result.fmt.sign = a.fmt.sign; - return result.i; -} -#endif - // template for FP16 HW conversion function template static void cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size); template<> void -cvtScaleHalf_( const float* src, size_t sstep, short* dst, size_t dstep, Size size) +cvtScaleHalf_( const float* src, size_t sstep, short* dst, size_t dstep, Size size ) { + CV_CPU_CALL_FP16(cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size)); + sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - if( checkHardwareSupport(CV_CPU_FP16) ) - { - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - -#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) - if ( ( (intptr_t)dst & 0xf ) == 0 ) -#endif - { -#if CV_FP16 && CV_SIMD128 - for ( ; x <= size.width - 4; x += 4) - { - v_float32x4 v_src = v_load(src + x); - - v_float16x4 v_dst = v_cvt_f16(v_src); - - v_store_f16(dst + x, v_dst); - } -#endif - } - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } - } - else + for( ; size.height--; src += sstep, dst += dstep ) { - for( ; size.height--; src += sstep, dst += dstep ) + for ( int x = 0; x < size.width; x++ ) { - int x = 0; - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } + dst[x] = convertFp16SW(src[x]); } } } template<> void -cvtScaleHalf_( const short* src, size_t sstep, float* dst, size_t dstep, Size size) +cvtScaleHalf_( const short* src, size_t sstep, float* dst, size_t dstep, Size size ) { + CV_CPU_CALL_FP16(cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size)); + sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - if( checkHardwareSupport(CV_CPU_FP16) ) - { - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - -#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) - if ( ( (intptr_t)src & 0xf ) == 0 ) -#endif - { -#if CV_FP16 && CV_SIMD128 - for ( ; x <= size.width - 4; x += 4) - { - v_float16x4 v_src = v_load_f16(src + x); - - v_float32x4 v_dst = v_cvt_f32(v_src); - - v_store(dst + x, v_dst); - } -#endif - } - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } - } - else + for( ; size.height--; src += sstep, dst += dstep ) { - for( ; size.height--; src += sstep, dst += dstep ) + for ( int x = 0; x < size.width; x++ ) { - int x = 0; - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } + dst[x] = convertFp16SW(src[x]); } } } @@ -5024,12 +4808,13 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s } #define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype) \ -static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ -dtype* dst, size_t dstep, Size size, double*) \ +static void cvtScaleHalf##suffix( const stype* src, size_t sstep, \ +dtype* dst, size_t dstep, Size size) \ { \ cvtScaleHalf_(src, sstep, dst, dstep, size); \ } + #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ dtype* dst, size_t dstep, Size size, double* scale) \ @@ -5210,12 +4995,16 @@ static BinaryFunc getCvtScaleAbsFunc(int depth) return cvtScaleAbsTab[depth]; } -BinaryFunc getConvertFuncFp16(int ddepth) +typedef void (*UnaryFunc)(const uchar* src1, size_t step1, + uchar* dst, size_t step, Size sz, + void*); + +static UnaryFunc getConvertFuncFp16(int ddepth) { - static BinaryFunc cvtTab[] = + static UnaryFunc cvtTab[] = { 0, 0, 0, - (BinaryFunc)(cvtScaleHalf32f16f), 0, (BinaryFunc)(cvtScaleHalf16f32f), + (UnaryFunc)(cvtScaleHalf32f16f), 0, (UnaryFunc)(cvtScaleHalf16f32f), 0, 0, }; return cvtTab[CV_MAT_DEPTH(ddepth)]; @@ -5461,14 +5250,14 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) int type = CV_MAKETYPE(ddepth, src.channels()); _dst.create( src.dims, src.size, type ); Mat dst = _dst.getMat(); - BinaryFunc func = getConvertFuncFp16(ddepth); + UnaryFunc func = getConvertFuncFp16(ddepth); int cn = src.channels(); CV_Assert( func != 0 ); if( src.dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); - func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0); + func( src.data, src.step, dst.data, dst.step, sz, 0); } else { @@ -5478,7 +5267,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) Size sz((int)(it.size*cn), 1); for( size_t i = 0; i < it.nplanes; i++, ++it ) - func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, 0); + func(ptrs[0], 1, ptrs[1], 1, sz, 0); } } diff --git a/modules/core/src/convert.fp16.cpp b/modules/core/src/convert.fp16.cpp new file mode 100644 index 0000000000..f5ad29bf0f --- /dev/null +++ b/modules/core/src/convert.fp16.cpp @@ -0,0 +1,172 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include "convert.hpp" + + +namespace cv +{ +namespace opt_FP16 +{ +#if !defined(CV_NEON) || !CV_NEON +const static int cVectorWidth = 8; + +void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) +{ + CV_INSTRUMENT_REGION() + + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + int x = 0; + for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) + { + __m256 v_src = _mm256_loadu_ps(src + x); + + // round to nearest even + __m128i v_dst = _mm256_cvtps_ph(v_src, 0); + + _mm_storeu_si128((__m128i*)(dst + x), v_dst); + } + + for ( ; x < size.width; x++ ) + { + dst[x] = convertFp16SW(src[x]); + } + } +} + +void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) +{ + CV_INSTRUMENT_REGION() + + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + int x = 0; + for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) + { + __m128i v_src = _mm_loadu_si128((__m128i*)(src + x)); + + __m256 v_dst = _mm256_cvtph_ps(v_src); + + _mm256_storeu_ps(dst + x, v_dst); + } + + for ( ; x < size.width; x++ ) + { + dst[x] = convertFp16SW(src[x]); + } + } +} +#elif CV_NEON +const static int cVectorWidth = 4; + +template static inline float16x4_t vld1_f16(const T* ptr) +{ return (float16x4_t)vld1_s16((const short*)ptr); } +template static inline void vst1_f16(T* ptr, float16x4_t a) +{ vst1_s16((short*)ptr, a); } + +void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) +{ + CV_INSTRUMENT_REGION() + + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + int x = 0; + for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) + { + float32x4_t v_src = vld1q_f32(src + x); + + float16x4_t v_dst = vcvt_f16_f32(v_src); + + vst1_f16((__fp16*)dst + x, v_dst); + } + + for ( ; x < size.width; x++ ) + { + dst[x] = convertFp16SW(src[x]); + } + } +} + +void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) +{ + CV_INSTRUMENT_REGION() + + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + int x = 0; + for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) + { + float16x4_t v_src = vld1_f16((__fp16*)src + x); + + float32x4_t v_dst = vcvt_f32_f16(v_src); + + vst1q_f32(dst + x, v_dst); + } + + for ( ; x < size.width; x++ ) + { + dst[x] = convertFp16SW(src[x]); + } + } +} +#else +#error "Unsupported build configuration" +#endif +} +} +/* End of file. */ diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp new file mode 100644 index 0000000000..936d1ca341 --- /dev/null +++ b/modules/core/src/convert.hpp @@ -0,0 +1,173 @@ + +namespace +{ +float convertFp16SW(short fp16); +short convertFp16SW(float fp32); + +#if !CV_FP16_TYPE +// const numbers for floating points format +const unsigned int kShiftSignificand = 13; +const unsigned int kMaskFp16Significand = 0x3ff; +const unsigned int kBiasFp16Exponent = 15; +const unsigned int kBiasFp32Exponent = 127; +#endif + +#if CV_FP16_TYPE +float convertFp16SW(short fp16) +{ + // Fp16 -> Fp32 + Cv16suf a; + a.i = fp16; + return (float)a.h; +} +#else +float convertFp16SW(short fp16) +{ + // Fp16 -> Fp32 + Cv16suf b; + b.i = fp16; + int exponent = b.fmt.exponent - kBiasFp16Exponent; + int significand = b.fmt.significand; + + Cv32suf a; + a.i = 0; + a.fmt.sign = b.fmt.sign; // sign bit + if( exponent == 16 ) + { + // Inf or NaN + a.i = a.i | 0x7F800000; + if( significand != 0 ) + { + // NaN +#if defined(__x86_64__) || defined(_M_X64) + // 64bit + a.i = a.i | 0x7FC00000; +#endif + a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand); + } + return a.f; + } + else if ( exponent == -(int)kBiasFp16Exponent ) + { + // subnormal in Fp16 + if( significand == 0 ) + { + // zero + return a.f; + } + else + { + int shift = -1; + while( ( significand & 0x400 ) == 0 ) + { + significand = significand << 1; + shift++; + } + significand = significand & kMaskFp16Significand; + exponent -= shift; + } + } + + a.fmt.exponent = (exponent+kBiasFp32Exponent); + a.fmt.significand = significand << kShiftSignificand; + return a.f; +} +#endif + +#if CV_FP16_TYPE +short convertFp16SW(float fp32) +{ + // Fp32 -> Fp16 + Cv16suf a; + a.h = (__fp16)fp32; + return a.i; +} +#else +short convertFp16SW(float fp32) +{ + // Fp32 -> Fp16 + Cv32suf a; + a.f = fp32; + int exponent = a.fmt.exponent - kBiasFp32Exponent; + int significand = a.fmt.significand; + + Cv16suf result; + result.i = 0; + unsigned int absolute = a.i & 0x7fffffff; + if( 0x477ff000 <= absolute ) + { + // Inf in Fp16 + result.i = result.i | 0x7C00; + if( exponent == 128 && significand != 0 ) + { + // NaN + result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) ); + } + } + else if ( absolute < 0x33000001 ) + { + // too small for fp16 + result.i = 0; + } + else if ( absolute < 0x387fe000 ) + { + // subnormal in Fp16 + int fp16Significand = significand | 0x800000; + int bitShift = (-exponent) - 1; + fp16Significand = fp16Significand >> bitShift; + + // special cases to round up + bitShift = exponent + 24; + int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) ); + if( absolute == 0x33c00000 ) + { + result.i = 2; + } + else + { + if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) ) + { + fp16Significand++; + } + result.i = (short)fp16Significand; + } + } + else + { + // usual situation + // exponent + result.fmt.exponent = ( exponent + kBiasFp16Exponent ); + + // significand; + short fp16Significand = (short)(significand >> kShiftSignificand); + result.fmt.significand = fp16Significand; + + // special cases to round up + short lsb10bitsFp32 = (significand & 0x1fff); + short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 ); + if( threshold <= lsb10bitsFp32 ) + { + result.i++; + } + else if ( fp16Significand == kMaskFp16Significand && exponent == -15) + { + result.i++; + } + } + + // sign bit + result.fmt.sign = a.fmt.sign; + return result.i; +} +#endif + +} + +namespace cv +{ +namespace opt_FP16 +{ +void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ); +void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ); +} +} \ No newline at end of file diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 6a63e84ef6..c852f3739c 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -135,7 +135,6 @@ typedef void (*BinaryFuncC)(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height, void*); -BinaryFunc getConvertFuncFp16(int ddepth); BinaryFunc getConvertFunc(int sdepth, int ddepth); BinaryFunc getCopyMaskFunc(size_t esz);