Merge pull request #8838 from tomoaki0705:dispatchFp16

8 years ago · 125abe2fe4
parent ebd98eaf3a e269ef96cb
commit 125abe2fe4
4 changed files with 371 additions and 238 deletions
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@ -44,7 +44,7 @@
 #include "precomp.hpp"

 #include "opencl_kernels_core.hpp"
-#include "opencv2/core/hal/intrin.hpp"
+#include "convert.hpp"

 #include "opencv2/core/openvx/ovx_defs.hpp"

@ -4573,256 +4573,40 @@ struct Cvt_SIMD<float, int>

 #endif

-#if !CV_FP16_TYPE
-// const numbers for floating points format
-const unsigned int kShiftSignificand    = 13;
-const unsigned int kMaskFp16Significand = 0x3ff;
-const unsigned int kBiasFp16Exponent    = 15;
-const unsigned int kBiasFp32Exponent    = 127;
-#endif
-
-#if CV_FP16_TYPE
-static float convertFp16SW(short fp16)
-{
-    // Fp16 -> Fp32
-    Cv16suf a;
-    a.i = fp16;
-    return (float)a.h;
-}
-#else
-static float convertFp16SW(short fp16)
-{
-    // Fp16 -> Fp32
-    Cv16suf b;
-    b.i = fp16;
-    int exponent    = b.fmt.exponent - kBiasFp16Exponent;
-    int significand = b.fmt.significand;
-
-    Cv32suf a;
-    a.i = 0;
-    a.fmt.sign = b.fmt.sign; // sign bit
-    if( exponent == 16 )
-    {
-        // Inf or NaN
-        a.i = a.i | 0x7F800000;
-        if( significand != 0 )
-        {
-            // NaN
-#if defined(__x86_64__) || defined(_M_X64)
-            // 64bit
-            a.i = a.i | 0x7FC00000;
-#endif
-            a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand);
-        }
-        return a.f;
-    }
-    else if ( exponent == -15 )
-    {
-        // subnormal in Fp16
-        if( significand == 0 )
-        {
-            // zero
-            return a.f;
-        }
-        else
-        {
-            int shift = -1;
-            while( ( significand & 0x400 ) == 0 )
-            {
-                significand = significand << 1;
-                shift++;
-            }
-            significand = significand & kMaskFp16Significand;
-            exponent -= shift;
-        }
-    }
-
-    a.fmt.exponent = (exponent+kBiasFp32Exponent);
-    a.fmt.significand = significand << kShiftSignificand;
-    return a.f;
-}
-#endif
-
-#if CV_FP16_TYPE
-static short convertFp16SW(float fp32)
-{
-    // Fp32 -> Fp16
-    Cv16suf a;
-    a.h = (__fp16)fp32;
-    return a.i;
-}
-#else
-static short convertFp16SW(float fp32)
-{
-    // Fp32 -> Fp16
-    Cv32suf a;
-    a.f = fp32;
-    int exponent    = a.fmt.exponent - kBiasFp32Exponent;
-    int significand = a.fmt.significand;
-
-    Cv16suf result;
-    result.i = 0;
-    unsigned int absolute = a.i & 0x7fffffff;
-    if( 0x477ff000 <= absolute )
-    {
-        // Inf in Fp16
-        result.i = result.i | 0x7C00;
-        if( exponent == 128 && significand != 0 )
-        {
-            // NaN
-            result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) );
-        }
-    }
-    else if ( absolute < 0x33000001 )
-    {
-        // too small for fp16
-        result.i = 0;
-    }
-    else if ( absolute < 0x33c00000 )
-    {
-        result.i = 1;
-    }
-    else if ( absolute < 0x34200001 )
-    {
-        result.i = 2;
-    }
-    else if ( absolute < 0x387fe000 )
-    {
-        // subnormal in Fp16
-        int fp16Significand = significand | 0x800000;
-        int bitShift = (-exponent) - 1;
-        fp16Significand = fp16Significand >> bitShift;
-
-        // special cases to round up
-        bitShift = exponent + 24;
-        int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) );
-        if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) )
-        {
-            fp16Significand++;
-        }
-        result.i = (short)fp16Significand;
-    }
-    else
-    {
-        // usual situation
-        // exponent
-        result.fmt.exponent = ( exponent + kBiasFp16Exponent );
-
-        // significand;
-        short fp16Significand = (short)(significand >> kShiftSignificand);
-        result.fmt.significand = fp16Significand;
-
-        // special cases to round up
-        short lsb10bitsFp32 = (significand & 0x1fff);
-        short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 );
-        if( threshold <= lsb10bitsFp32 )
-        {
-            result.i++;
-        }
-        else if ( fp16Significand == 0x3ff && exponent == -15)
-        {
-            result.i++;
-        }
-    }
-
-    // sign bit
-    result.fmt.sign = a.fmt.sign;
-    return result.i;
-}
-#endif
-
 // template for FP16 HW conversion function
 template<typename T, typename DT> static void
 cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);

 template<> void
-cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
+cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size )
 {
+    CV_CPU_CALL_FP16(cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size));
+
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

-    if( checkHardwareSupport(CV_CPU_FP16) )
-    {
-        for( ; size.height--; src += sstep, dst += dstep )
-        {
-            int x = 0;
-
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-            if ( ( (intptr_t)dst & 0xf ) == 0 )
-#endif
-            {
-#if CV_FP16 && CV_SIMD128
-                for ( ; x <= size.width - 4; x += 4)
-                {
-                    v_float32x4 v_src = v_load(src + x);
-
-                    v_float16x4 v_dst = v_cvt_f16(v_src);
-
-                    v_store_f16(dst + x, v_dst);
-                }
-#endif
-            }
-            for ( ; x < size.width; x++ )
-            {
-                dst[x] = convertFp16SW(src[x]);
-            }
-        }
-    }
-    else
+    for( ; size.height--; src += sstep, dst += dstep )
    {
-        for( ; size.height--; src += sstep, dst += dstep )
+        for ( int x = 0; x < size.width; x++ )
        {
-            int x = 0;
-            for ( ; x < size.width; x++ )
-            {
-                dst[x] = convertFp16SW(src[x]);
-            }
+            dst[x] = convertFp16SW(src[x]);
        }
    }
 }

 template<> void
-cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t dstep, Size size)
+cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t dstep, Size size )
 {
+    CV_CPU_CALL_FP16(cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size));
+
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

-    if( checkHardwareSupport(CV_CPU_FP16) )
-    {
-        for( ; size.height--; src += sstep, dst += dstep )
-        {
-            int x = 0;
-
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-            if ( ( (intptr_t)src & 0xf ) == 0 )
-#endif
-            {
-#if CV_FP16 && CV_SIMD128
-                for ( ; x <= size.width - 4; x += 4)
-                {
-                    v_float16x4 v_src = v_load_f16(src + x);
-
-                    v_float32x4 v_dst = v_cvt_f32(v_src);
-
-                    v_store(dst + x, v_dst);
-                }
-#endif
-            }
-            for ( ; x < size.width; x++ )
-            {
-                dst[x] = convertFp16SW(src[x]);
-            }
-        }
-    }
-    else
+    for( ; size.height--; src += sstep, dst += dstep )
    {
-        for( ; size.height--; src += sstep, dst += dstep )
+        for ( int x = 0; x < size.width; x++ )
        {
-            int x = 0;
-            for ( ; x < size.width; x++ )
-            {
-                dst[x] = convertFp16SW(src[x]);
-            }
+            dst[x] = convertFp16SW(src[x]);
        }
    }
 }
@ -5027,12 +4811,13 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
 }

 #define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype) \
-static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-dtype* dst, size_t dstep, Size size, double*) \
+static void cvtScaleHalf##suffix( const stype* src, size_t sstep, \
+dtype* dst, size_t dstep, Size size) \
 { \
    cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
 }

+
 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 dtype* dst, size_t dstep, Size size, double* scale) \
@ -5213,12 +4998,16 @@ static BinaryFunc getCvtScaleAbsFunc(int depth)
    return cvtScaleAbsTab[depth];
 }

-BinaryFunc getConvertFuncFp16(int ddepth)
+typedef void (*UnaryFunc)(const uchar* src1, size_t step1,
+                       uchar* dst, size_t step, Size sz,
+                       void*);
+
+static UnaryFunc getConvertFuncFp16(int ddepth)
 {
-    static BinaryFunc cvtTab[] =
+    static UnaryFunc cvtTab[] =
    {
        0, 0, 0,
-        (BinaryFunc)(cvtScaleHalf32f16f), 0, (BinaryFunc)(cvtScaleHalf16f32f),
+        (UnaryFunc)(cvtScaleHalf32f16f), 0, (UnaryFunc)(cvtScaleHalf16f32f),
        0, 0,
    };
    return cvtTab[CV_MAT_DEPTH(ddepth)];
@ -5464,14 +5253,14 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
    int type = CV_MAKETYPE(ddepth, src.channels());
    _dst.create( src.dims, src.size, type );
    Mat dst = _dst.getMat();
-    BinaryFunc func = getConvertFuncFp16(ddepth);
+    UnaryFunc func = getConvertFuncFp16(ddepth);
    int cn = src.channels();
    CV_Assert( func != 0 );

    if( src.dims <= 2 )
    {
        Size sz = getContinuousSize(src, dst, cn);
-        func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0);
+        func( src.data, src.step, dst.data, dst.step, sz, 0);
    }
    else
    {
@ -5481,7 +5270,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
        Size sz((int)(it.size*cn), 1);

        for( size_t i = 0; i < it.nplanes; i++, ++it )
-            func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, 0);
+            func(ptrs[0], 1, ptrs[1], 1, sz, 0);
    }
 }

--- a/modules/core/src/convert.fp16.cpp
+++ b/modules/core/src/convert.fp16.cpp
@ -0,0 +1,172 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#include "convert.hpp"
+
+
+namespace cv
+{
+namespace opt_FP16
+{
+#if !defined(CV_NEON) || !CV_NEON
+const static int cVectorWidth = 8;
+
+void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size )
+{
+    CV_INSTRUMENT_REGION()
+
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
+        {
+            __m256 v_src = _mm256_loadu_ps(src + x);
+
+            // round to nearest even
+            __m128i v_dst = _mm256_cvtps_ph(v_src, 0);
+
+            _mm_storeu_si128((__m128i*)(dst + x), v_dst);
+        }
+
+        for ( ; x < size.width; x++ )
+        {
+            dst[x] = convertFp16SW(src[x]);
+        }
+    }
+}
+
+void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size )
+{
+    CV_INSTRUMENT_REGION()
+
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i*)(src + x));
+
+            __m256 v_dst = _mm256_cvtph_ps(v_src);
+
+            _mm256_storeu_ps(dst + x, v_dst);
+        }
+
+        for ( ; x < size.width; x++ )
+        {
+            dst[x] = convertFp16SW(src[x]);
+        }
+    }
+}
+#elif CV_NEON
+const static int cVectorWidth = 4;
+
+template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
+{ return (float16x4_t)vld1_s16((const short*)ptr); }
+template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
+{ vst1_s16((short*)ptr, a); }
+
+void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size )
+{
+    CV_INSTRUMENT_REGION()
+
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth)
+        {
+            float32x4_t v_src = vld1q_f32(src + x);
+
+            float16x4_t v_dst = vcvt_f16_f32(v_src);
+
+            vst1_f16((__fp16*)dst + x, v_dst);
+        }
+
+        for ( ; x < size.width; x++ )
+        {
+            dst[x] = convertFp16SW(src[x]);
+        }
+    }
+}
+
+void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size )
+{
+    CV_INSTRUMENT_REGION()
+
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
+        {
+            float16x4_t v_src = vld1_f16((__fp16*)src + x);
+
+            float32x4_t v_dst = vcvt_f32_f16(v_src);
+
+            vst1q_f32(dst + x, v_dst);
+        }
+
+        for ( ; x < size.width; x++ )
+        {
+            dst[x] = convertFp16SW(src[x]);
+        }
+    }
+}
+#else
+#error "Unsupported build configuration"
+#endif
+}
+}
+/* End of file. */
--- a/modules/core/src/convert.hpp
+++ b/modules/core/src/convert.hpp
@ -0,0 +1,173 @@
+
+namespace
+{
+float convertFp16SW(short fp16);
+short convertFp16SW(float fp32);
+
+#if !CV_FP16_TYPE
+// const numbers for floating points format
+const unsigned int kShiftSignificand    = 13;
+const unsigned int kMaskFp16Significand = 0x3ff;
+const unsigned int kBiasFp16Exponent    = 15;
+const unsigned int kBiasFp32Exponent    = 127;
+#endif
+
+#if CV_FP16_TYPE
+float convertFp16SW(short fp16)
+{
+    // Fp16 -> Fp32
+    Cv16suf a;
+    a.i = fp16;
+    return (float)a.h;
+}
+#else
+float convertFp16SW(short fp16)
+{
+    // Fp16 -> Fp32
+    Cv16suf b;
+    b.i = fp16;
+    int exponent    = b.fmt.exponent - kBiasFp16Exponent;
+    int significand = b.fmt.significand;
+
+    Cv32suf a;
+    a.i = 0;
+    a.fmt.sign = b.fmt.sign; // sign bit
+    if( exponent == 16 )
+    {
+        // Inf or NaN
+        a.i = a.i | 0x7F800000;
+        if( significand != 0 )
+        {
+            // NaN
+#if defined(__x86_64__) || defined(_M_X64)
+            // 64bit
+            a.i = a.i | 0x7FC00000;
+#endif
+            a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand);
+        }
+        return a.f;
+    }
+    else if ( exponent == -(int)kBiasFp16Exponent )
+    {
+        // subnormal in Fp16
+        if( significand == 0 )
+        {
+            // zero
+            return a.f;
+        }
+        else
+        {
+            int shift = -1;
+            while( ( significand & 0x400 ) == 0 )
+            {
+                significand = significand << 1;
+                shift++;
+            }
+            significand = significand & kMaskFp16Significand;
+            exponent -= shift;
+        }
+    }
+
+    a.fmt.exponent = (exponent+kBiasFp32Exponent);
+    a.fmt.significand = significand << kShiftSignificand;
+    return a.f;
+}
+#endif
+
+#if CV_FP16_TYPE
+short convertFp16SW(float fp32)
+{
+    // Fp32 -> Fp16
+    Cv16suf a;
+    a.h = (__fp16)fp32;
+    return a.i;
+}
+#else
+short convertFp16SW(float fp32)
+{
+    // Fp32 -> Fp16
+    Cv32suf a;
+    a.f = fp32;
+    int exponent    = a.fmt.exponent - kBiasFp32Exponent;
+    int significand = a.fmt.significand;
+
+    Cv16suf result;
+    result.i = 0;
+    unsigned int absolute = a.i & 0x7fffffff;
+    if( 0x477ff000 <= absolute )
+    {
+        // Inf in Fp16
+        result.i = result.i | 0x7C00;
+        if( exponent == 128 && significand != 0 )
+        {
+            // NaN
+            result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) );
+        }
+    }
+    else if ( absolute < 0x33000001 )
+    {
+        // too small for fp16
+        result.i = 0;
+    }
+    else if ( absolute < 0x387fe000 )
+    {
+        // subnormal in Fp16
+        int fp16Significand = significand | 0x800000;
+        int bitShift = (-exponent) - 1;
+        fp16Significand = fp16Significand >> bitShift;
+
+        // special cases to round up
+        bitShift = exponent + 24;
+        int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) );
+        if( absolute == 0x33c00000 )
+        {
+            result.i = 2;
+        }
+        else
+        {
+            if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) )
+            {
+                fp16Significand++;
+            }
+            result.i = (short)fp16Significand;
+        }
+    }
+    else
+    {
+        // usual situation
+        // exponent
+        result.fmt.exponent = ( exponent + kBiasFp16Exponent );
+
+        // significand;
+        short fp16Significand = (short)(significand >> kShiftSignificand);
+        result.fmt.significand = fp16Significand;
+
+        // special cases to round up
+        short lsb10bitsFp32 = (significand & 0x1fff);
+        short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 );
+        if( threshold <= lsb10bitsFp32 )
+        {
+            result.i++;
+        }
+        else if ( fp16Significand == kMaskFp16Significand && exponent == -15)
+        {
+            result.i++;
+        }
+    }
+
+    // sign bit
+    result.fmt.sign = a.fmt.sign;
+    return result.i;
+}
+#endif
+
+}
+
+namespace cv
+{
+namespace opt_FP16
+{
+void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size );
+void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size );
+}
+}
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@ -137,7 +137,6 @@ typedef void (*BinaryFuncC)(const uchar* src1, size_t step1,
                       uchar* dst, size_t step, int width, int height,
                       void*);

-BinaryFunc getConvertFuncFp16(int ddepth);
 BinaryFunc getConvertFunc(int sdepth, int ddepth);
 BinaryFunc getCopyMaskFunc(size_t esz);