mirror of https://github.com/opencv/opencv.git
Merge pull request #8838 from tomoaki0705:dispatchFp16
commit
125abe2fe4
4 changed files with 371 additions and 238 deletions
@ -0,0 +1,172 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp" |
||||
|
||||
#include "convert.hpp" |
||||
|
||||
|
||||
namespace cv |
||||
{ |
||||
namespace opt_FP16 |
||||
{ |
||||
#if !defined(CV_NEON) || !CV_NEON |
||||
const static int cVectorWidth = 8; |
||||
|
||||
void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) |
||||
{ |
||||
CV_INSTRUMENT_REGION() |
||||
|
||||
sstep /= sizeof(src[0]); |
||||
dstep /= sizeof(dst[0]); |
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep ) |
||||
{ |
||||
int x = 0; |
||||
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) |
||||
{ |
||||
__m256 v_src = _mm256_loadu_ps(src + x); |
||||
|
||||
// round to nearest even
|
||||
__m128i v_dst = _mm256_cvtps_ph(v_src, 0); |
||||
|
||||
_mm_storeu_si128((__m128i*)(dst + x), v_dst); |
||||
} |
||||
|
||||
for ( ; x < size.width; x++ ) |
||||
{ |
||||
dst[x] = convertFp16SW(src[x]); |
||||
} |
||||
} |
||||
} |
||||
|
||||
void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) |
||||
{ |
||||
CV_INSTRUMENT_REGION() |
||||
|
||||
sstep /= sizeof(src[0]); |
||||
dstep /= sizeof(dst[0]); |
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep ) |
||||
{ |
||||
int x = 0; |
||||
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) |
||||
{ |
||||
__m128i v_src = _mm_loadu_si128((__m128i*)(src + x)); |
||||
|
||||
__m256 v_dst = _mm256_cvtph_ps(v_src); |
||||
|
||||
_mm256_storeu_ps(dst + x, v_dst); |
||||
} |
||||
|
||||
for ( ; x < size.width; x++ ) |
||||
{ |
||||
dst[x] = convertFp16SW(src[x]); |
||||
} |
||||
} |
||||
} |
||||
#elif CV_NEON |
||||
const static int cVectorWidth = 4; |
||||
|
||||
template <typename T> static inline float16x4_t vld1_f16(const T* ptr) |
||||
{ return (float16x4_t)vld1_s16((const short*)ptr); } |
||||
template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a) |
||||
{ vst1_s16((short*)ptr, a); } |
||||
|
||||
void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) |
||||
{ |
||||
CV_INSTRUMENT_REGION() |
||||
|
||||
sstep /= sizeof(src[0]); |
||||
dstep /= sizeof(dst[0]); |
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep ) |
||||
{ |
||||
int x = 0; |
||||
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) |
||||
{ |
||||
float32x4_t v_src = vld1q_f32(src + x); |
||||
|
||||
float16x4_t v_dst = vcvt_f16_f32(v_src); |
||||
|
||||
vst1_f16((__fp16*)dst + x, v_dst); |
||||
} |
||||
|
||||
for ( ; x < size.width; x++ ) |
||||
{ |
||||
dst[x] = convertFp16SW(src[x]); |
||||
} |
||||
} |
||||
} |
||||
|
||||
void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) |
||||
{ |
||||
CV_INSTRUMENT_REGION() |
||||
|
||||
sstep /= sizeof(src[0]); |
||||
dstep /= sizeof(dst[0]); |
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep ) |
||||
{ |
||||
int x = 0; |
||||
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) |
||||
{ |
||||
float16x4_t v_src = vld1_f16((__fp16*)src + x); |
||||
|
||||
float32x4_t v_dst = vcvt_f32_f16(v_src); |
||||
|
||||
vst1q_f32(dst + x, v_dst); |
||||
} |
||||
|
||||
for ( ; x < size.width; x++ ) |
||||
{ |
||||
dst[x] = convertFp16SW(src[x]); |
||||
} |
||||
} |
||||
} |
||||
#else |
||||
#error "Unsupported build configuration" |
||||
#endif |
||||
} |
||||
} |
||||
/* End of file. */ |
@ -0,0 +1,173 @@ |
||||
|
||||
namespace |
||||
{ |
||||
float convertFp16SW(short fp16); |
||||
short convertFp16SW(float fp32); |
||||
|
||||
#if !CV_FP16_TYPE |
||||
// const numbers for floating points format
|
||||
const unsigned int kShiftSignificand = 13; |
||||
const unsigned int kMaskFp16Significand = 0x3ff; |
||||
const unsigned int kBiasFp16Exponent = 15; |
||||
const unsigned int kBiasFp32Exponent = 127; |
||||
#endif |
||||
|
||||
#if CV_FP16_TYPE |
||||
float convertFp16SW(short fp16) |
||||
{ |
||||
// Fp16 -> Fp32
|
||||
Cv16suf a; |
||||
a.i = fp16; |
||||
return (float)a.h; |
||||
} |
||||
#else |
||||
float convertFp16SW(short fp16) |
||||
{ |
||||
// Fp16 -> Fp32
|
||||
Cv16suf b; |
||||
b.i = fp16; |
||||
int exponent = b.fmt.exponent - kBiasFp16Exponent; |
||||
int significand = b.fmt.significand; |
||||
|
||||
Cv32suf a; |
||||
a.i = 0; |
||||
a.fmt.sign = b.fmt.sign; // sign bit
|
||||
if( exponent == 16 ) |
||||
{ |
||||
// Inf or NaN
|
||||
a.i = a.i | 0x7F800000; |
||||
if( significand != 0 ) |
||||
{ |
||||
// NaN
|
||||
#if defined(__x86_64__) || defined(_M_X64) |
||||
// 64bit
|
||||
a.i = a.i | 0x7FC00000; |
||||
#endif |
||||
a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand); |
||||
} |
||||
return a.f; |
||||
} |
||||
else if ( exponent == -(int)kBiasFp16Exponent ) |
||||
{ |
||||
// subnormal in Fp16
|
||||
if( significand == 0 ) |
||||
{ |
||||
// zero
|
||||
return a.f; |
||||
} |
||||
else |
||||
{ |
||||
int shift = -1; |
||||
while( ( significand & 0x400 ) == 0 ) |
||||
{ |
||||
significand = significand << 1; |
||||
shift++; |
||||
} |
||||
significand = significand & kMaskFp16Significand; |
||||
exponent -= shift; |
||||
} |
||||
} |
||||
|
||||
a.fmt.exponent = (exponent+kBiasFp32Exponent); |
||||
a.fmt.significand = significand << kShiftSignificand; |
||||
return a.f; |
||||
} |
||||
#endif |
||||
|
||||
#if CV_FP16_TYPE |
||||
short convertFp16SW(float fp32) |
||||
{ |
||||
// Fp32 -> Fp16
|
||||
Cv16suf a; |
||||
a.h = (__fp16)fp32; |
||||
return a.i; |
||||
} |
||||
#else |
||||
short convertFp16SW(float fp32) |
||||
{ |
||||
// Fp32 -> Fp16
|
||||
Cv32suf a; |
||||
a.f = fp32; |
||||
int exponent = a.fmt.exponent - kBiasFp32Exponent; |
||||
int significand = a.fmt.significand; |
||||
|
||||
Cv16suf result; |
||||
result.i = 0; |
||||
unsigned int absolute = a.i & 0x7fffffff; |
||||
if( 0x477ff000 <= absolute ) |
||||
{ |
||||
// Inf in Fp16
|
||||
result.i = result.i | 0x7C00; |
||||
if( exponent == 128 && significand != 0 ) |
||||
{ |
||||
// NaN
|
||||
result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) ); |
||||
} |
||||
} |
||||
else if ( absolute < 0x33000001 ) |
||||
{ |
||||
// too small for fp16
|
||||
result.i = 0; |
||||
} |
||||
else if ( absolute < 0x387fe000 ) |
||||
{ |
||||
// subnormal in Fp16
|
||||
int fp16Significand = significand | 0x800000; |
||||
int bitShift = (-exponent) - 1; |
||||
fp16Significand = fp16Significand >> bitShift; |
||||
|
||||
// special cases to round up
|
||||
bitShift = exponent + 24; |
||||
int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) ); |
||||
if( absolute == 0x33c00000 ) |
||||
{ |
||||
result.i = 2; |
||||
} |
||||
else |
||||
{ |
||||
if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) ) |
||||
{ |
||||
fp16Significand++; |
||||
} |
||||
result.i = (short)fp16Significand; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
// usual situation
|
||||
// exponent
|
||||
result.fmt.exponent = ( exponent + kBiasFp16Exponent ); |
||||
|
||||
// significand;
|
||||
short fp16Significand = (short)(significand >> kShiftSignificand); |
||||
result.fmt.significand = fp16Significand; |
||||
|
||||
// special cases to round up
|
||||
short lsb10bitsFp32 = (significand & 0x1fff); |
||||
short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 ); |
||||
if( threshold <= lsb10bitsFp32 ) |
||||
{ |
||||
result.i++; |
||||
} |
||||
else if ( fp16Significand == kMaskFp16Significand && exponent == -15) |
||||
{ |
||||
result.i++; |
||||
} |
||||
} |
||||
|
||||
// sign bit
|
||||
result.fmt.sign = a.fmt.sign; |
||||
return result.i; |
||||
} |
||||
#endif |
||||
|
||||
} |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace opt_FP16 |
||||
{ |
||||
void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ); |
||||
void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ); |
||||
} |
||||
} |
Loading…
Reference in new issue